From cc12b4e8a79a52567a2b1448c160d62493b56b21 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Thu, 24 Jul 2025 16:33:08 +0800 Subject: [PATCH 001/182] init async training pipline --- .../fully_async_policy/README_fully_async.md | 183 +++++++ .../config/fully_async_ppo_trainer.yaml | 136 +++++ recipe/fully_async_policy/fully_async_main.py | 244 +++++++++ .../fully_async_policy/fully_async_trainer.py | 490 ++++++++++++++++++ recipe/fully_async_policy/message_queue.py | 238 +++++++++ recipe/fully_async_policy/param_sync.py | 175 +++++++ recipe/fully_async_policy/rollouter.py | 414 +++++++++++++++ .../run_fully_async_example.sh | 149 ++++++ recipe/fully_async_policy/test_fully_async.py | 197 +++++++ tests/special_sanity/check_license.py | 4 +- 10 files changed, 2229 insertions(+), 1 deletion(-) create mode 100644 recipe/fully_async_policy/README_fully_async.md create mode 100644 recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml create mode 100644 recipe/fully_async_policy/fully_async_main.py create mode 100644 recipe/fully_async_policy/fully_async_trainer.py create mode 100644 recipe/fully_async_policy/message_queue.py create mode 100644 recipe/fully_async_policy/param_sync.py create mode 100644 recipe/fully_async_policy/rollouter.py create mode 100644 recipe/fully_async_policy/run_fully_async_example.sh create mode 100644 recipe/fully_async_policy/test_fully_async.py diff --git a/recipe/fully_async_policy/README_fully_async.md b/recipe/fully_async_policy/README_fully_async.md new file mode 100644 index 00000000000..979f9aff783 --- /dev/null +++ b/recipe/fully_async_policy/README_fully_async.md @@ -0,0 +1,183 @@ +# 完全异步训练工作流 (Fully Async Training Workflow) + +## 概述 + +本项目实现了基于现有 one step off policy 代码的完全异步训练工作流,将样本生成(Rollouter)和模型训练(Trainer)完全解耦,通过 MessageQueue 进行异步通信。 + +## 架构设计 + +### 核心组件 + +1. **MessageQueue**: 基于 ZeroMQ 的异步消息队列,作为 Ray Actor 存在 + - 管理生成的样本队列 + - 支持新鲜度控制,自动丢弃过期样本 + - 提供线程安全的生产者-消费者接口 + +2. 
**Rollouter**: 专门负责样本生成的组件 + - 持续循环生成训练样本 + - 支持暂停/恢复机制,用于参数更新 + - 实现新鲜度阈值控制,避免生成过多过期样本 + +3. **FullyAsyncTrainer**: 修改后的训练器 + - 从 MessageQueue 获取样本进行训练 + - 训练完成后通知 Rollouter 更新参数 + - 支持样本新鲜度监控和统计 + +4. **ParameterSynchronizer**: 参数同步模块 + - 基于 NCCL 实现高效的参数同步 + - 支持 Actor 到 Rollout 的参数传递 + +### 工作流程 + +``` +┌─────────────┐ put_batch ┌──────────────┐ get_batch ┌─────────────┐ +│ Rollouter │ ──────────────► │ MessageQueue │ ──────────────► │ Trainer │ +│ │ │ │ │ │ +│ - 生成样本 │ │ - 队列管理 │ │ - 模型训练 │ +│ - 暂停/恢复 │ │ - 新鲜度控制 │ │ - 参数更新 │ +│ - 新鲜度控制 │ │ - 统计信息 │ │ - 同步通知 │ +└─────────────┘ └──────────────┘ └─────────────┘ + ▲ │ + │ update_rollout_weights │ + └─────────────────────────────────────────────────────────────────┘ +``` + +## 新鲜度控制机制 + +### 配置参数 + +- `freshness_threshold`: 新鲜度阈值,队列中超过此版本差异的样本会被丢弃 +- `max_staleness_allowed`: 最大允许的新鲜度差异,Rollouter 会暂停生成 +- `max_queue_size`: MessageQueue 的最大队列大小 + +### 控制逻辑 + +1. **样本丢弃**: 当样本的参数版本与当前 Trainer 版本差异超过 `freshness_threshold` 时,样本被丢弃 +2. **生成暂停**: 当 Rollouter 的参数版本与 Trainer 版本差异超过 `max_staleness_allowed` 时,暂停生成 +3. **队列管理**: 队列长度限制为 `freshness_threshold * batch_size`,避免内存溢出 + +## 性能优势 + +### 相比同步训练 + +- **GPU 利用率提升**: 生成和训练并行进行,减少 GPU 空闲时间 +- **长尾样本优化**: 训练不需要等待最慢的样本生成完成 +- **资源隔离**: 可以独立配置生成和训练的资源分配 + +### 相比 One Step Off Policy + +- **更高的异步度**: 完全解耦生成和训练,支持多步异步 +- **更灵活的控制**: 支持动态的新鲜度控制和队列管理 +- **更好的监控**: 提供详细的统计信息和性能指标 + +## 使用方法 + +### 1. 安装依赖 + +```bash +pip install zmq filelock +``` + +### 2. 配置文件 + +使用 `config/fully_async_ppo_trainer.yaml` 配置文件,关键配置项: + +```yaml +async_training: + freshness_threshold: 3 # 新鲜度阈值 + max_staleness_allowed: 5 # 最大允许新鲜度差异 + max_queue_size: 1000 # 队列最大大小 + min_batch_count: 1 # 最小batch数量 + batch_timeout: 30.0 # 获取batch超时时间 + +actor_rollout_ref: + rollout: + mode: async # 使用异步模式 + n_gpus: 4 # rollout专用GPU数量 + name: vllm # 使用vLLM引擎 +``` + +### 3. 
启动训练 + +```bash +python -m recipe.fully_async_policy.fully_async_main \ + data.train_files=~/data/train.parquet \ + data.val_files=~/data/val.parquet \ + actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \ + trainer.total_training_steps=1000 +``` + +### 4. 监控训练 + +训练过程中会输出以下统计信息: + +- `queue_size`: 当前队列大小 +- `avg_sample_age`: 平均样本年龄(参数版本差异) +- `max_sample_age`: 最大样本年龄 +- `param_version`: 当前参数版本 +- `processed_samples`: 已处理样本数 +- `dropped_samples`: 丢弃的过期样本数 + +## 性能调优建议 + +### 1. 资源分配 + +- **生成资源**: 根据模型大小和生成速度需求分配 GPU +- **训练资源**: 根据batch大小和训练复杂度分配 GPU +- **比例建议**: 生成:训练 = 1:2 到 1:3 + +### 2. 新鲜度控制 + +- **快速生成场景**: 降低 `freshness_threshold` (2-3) +- **慢速生成场景**: 提高 `freshness_threshold` (5-8) +- **队列大小**: 设置为 `freshness_threshold * batch_size * 2` + +### 3. 网络优化 + +- **单节点**: MessageQueue 使用 IPC 协议 +- **多节点**: MessageQueue 使用 TCP 协议,注意网络带宽 + +## 故障排除 + +### 常见问题 + +1. **队列为空**: 检查 Rollouter 是否正常运行,是否被新鲜度控制暂停 +2. **内存溢出**: 减少 `max_queue_size` 或降低 `freshness_threshold` +3. **参数同步失败**: 检查 NCCL 配置和网络连接 +4. **性能下降**: 调整资源分配比例,监控 GPU 利用率 + +### 调试模式 + +设置环境变量启用详细日志: + +```bash +export VERL_LOGGING_LEVEL=DEBUG +export NCCL_DEBUG=INFO +``` + +## 与现有系统对比 + +| 特性 | 同步训练 | One Step Off | 完全异步 | +|------|----------|--------------|----------| +| 异步程度 | 无 | 一步 | 多步 | +| 资源利用率 | 低 | 中 | 高 | +| 实现复杂度 | 低 | 中 | 高 | +| 样本新鲜度 | 最新 | 一步延迟 | 可控延迟 | +| 内存使用 | 低 | 中 | 中-高 | + +## 实验结果预期 + +基于现有 one step off policy 的实验结果,完全异步训练预期能够: + +- **训练速度**: 相比同步训练提升 30-50% +- **GPU 利用率**: 提升至 85-95% +- **内存开销**: 增加 20-30%(主要用于队列缓存) +- **模型收敛**: 与同步训练基本一致(在合理的新鲜度控制下) + +## 后续改进 + +1. **自适应新鲜度控制**: 根据训练进度动态调整新鲜度阈值 +2. **多队列支持**: 支持不同优先级的样本队列 +3. **分布式队列**: 支持跨节点的分布式消息队列 +4. 
**更精细的资源调度**: 支持动态的资源分配和调整 + diff --git a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml new file mode 100644 index 00000000000..cbc7058f108 --- /dev/null +++ b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml @@ -0,0 +1,136 @@ +hydra: + searchpath: + - file://verl/trainer/config + +defaults: + - ppo_trainer + - _self_ + +# 完全异步训练的特殊配置 +async_training: + # 新鲜度阈值,超过此版本差异的样本会被丢弃 + freshness_threshold: 3 + + # 最大允许的新鲜度差异,rollout会暂停生成 + max_staleness_allowed: 5 + + # MessageQueue的最大队列大小 + max_queue_size: 1000 + + # 最小batch数量,trainer会等待至少这么多batch + min_batch_count: 1 + + # 获取batch的超时时间(秒) + batch_timeout: 30.0 + +# 重写默认的训练配置 +actor_rollout_ref: + hybrid_engine: false + rollout: + # 异步模式 + mode: async + + # rollout专用的GPU数量 + n_gpus: 4 + + # 使用vLLM异步rollout + name: vllm + + # 其他rollout参数 + temperature: 1.0 + top_k: -1 + top_p: 1.0 + tensor_model_parallel_size: 2 + gpu_memory_utilization: 0.6 + max_num_batched_tokens: 8192 + free_cache_engine: true + enforce_eager: true + +# 训练器配置 +trainer: + # 总训练步数 + total_training_steps: 1000 + + # 设备 + device: cuda + + # 保存频率 + save_freq: 100 + + # 验证频率 + val_freq: 50 + + # 日志配置 + logger: '["console", "wandb"]' + project_name: "fully_async_ppo" + experiment_name: "test_async_training" + +# 数据配置 +data: + # 训练batch大小 + train_batch_size: 128 + + # 数据文件路径 + train_files: "~/data/train.parquet" + val_files: "~/data/val.parquet" + + # 序列长度 + max_prompt_length: 1024 + max_response_length: 1024 + +# 算法配置 +algorithm: + # 优势估计器 + adv_estimator: gae + + # PPO参数 + cliprange: 0.2 + cliprange_value: 0.2 + vf_coeff: 0.1 + entropy_coeff: 0.01 + + # KL相关 + kl_coeff: 0.1 + adaptive_kl: true + target_kl: 0.01 + +# 模型配置 +actor_rollout_ref: + model: + # 模型路径 + path: "Qwen/Qwen2-7B-Instruct" + + # 使用LoRA + lora_rank: 64 + lora_alpha: 128 + lora_dropout: 0.1 + + actor: + # Actor优化器 + optim: + lr: 1e-6 + weight_decay: 0.01 + + # FSDP配置 + fsdp_config: + fsdp_size: -1 + 
param_offload: false + optimizer_offload: false + + # PPO配置 + ppo_mini_batch_size: 32 + use_dynamic_bsz: true + +# Critic配置 +critic: + model: + path: "Qwen/Qwen2-7B-Instruct" + + optim: + lr: 1e-5 + weight_decay: 0.01 + + fsdp_config: + fsdp_size: -1 + param_offload: false + diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py new file mode 100644 index 00000000000..3bab5d91eb1 --- /dev/null +++ b/recipe/fully_async_policy/fully_async_main.py @@ -0,0 +1,244 @@ +# Copyright 2025 Meituan Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import logging
import os
import threading
import time

import hydra
import ray

from recipe.fully_async_policy.fully_async_trainer import FullyAsyncTrainer
from recipe.fully_async_policy.message_queue import MessageQueue, MessageQueueClient
from recipe.fully_async_policy.rollouter import Rollouter
from verl.trainer.main_ppo import create_rl_dataset, create_rl_sampler
from verl.trainer.ppo.reward import load_reward_manager

logger = logging.getLogger(__name__)


def setup_logging():
    """Configure root logging at INFO level with a timestamped format."""
    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")


@ray.remote
class RollouterActor:
    """Ray actor wrapper that forwards every call to an owned ``Rollouter``."""

    def __init__(
        self,
        config,
        tokenizer,
        role_worker_mapping,
        resource_pool_manager,
        ray_worker_group_cls,
        processor=None,
        train_dataset=None,
        collate_fn=None,
        train_sampler=None,
        device_name="cuda",
    ):
        """Build the wrapped ``Rollouter``; all arguments are passed through unchanged."""
        self.rollouter = Rollouter(
            config=config,
            tokenizer=tokenizer,
            role_worker_mapping=role_worker_mapping,
            resource_pool_manager=resource_pool_manager,
            ray_worker_group_cls=ray_worker_group_cls,
            processor=processor,
            train_dataset=train_dataset,
            collate_fn=collate_fn,
            train_sampler=train_sampler,
            device_name=device_name,
        )

    def init_workers(self):
        """Initialize the rollout workers."""
        return self.rollouter.init_workers()

    def set_message_queue_client(self, message_queue_client):
        """Attach the message-queue client the rollouter publishes samples to."""
        return self.rollouter.set_message_queue_client(message_queue_client)

    def set_parameter_synchronizer(self, param_synchronizer):
        """Attach the parameter synchronizer."""
        return self.rollouter.set_parameter_synchronizer(param_synchronizer)

    def update_rollout_weights(self, param_version: int):
        """Ask the rollouter to update its weights to ``param_version``."""
        return self.rollouter.update_rollout_weights(param_version)

    def fit(self):
        """Run the rollouter's generation loop."""
        return self.rollouter.fit()

    def shutdown(self):
        """Shut the rollouter down."""
        return self.rollouter.shutdown()

    def get_statistics(self):
        """Return the rollouter's statistics."""
        return self.rollouter.get_statistics()


def run_fully_async_ppo(config):
    """Run fully asynchronous PPO training.

    Wires together a ``MessageQueue`` actor, a ``RollouterActor`` (producer,
    driven from a background thread) and a ``FullyAsyncTrainer`` (consumer,
    driven from the calling thread), then shuts everything down.

    Args:
        config: Hydra/OmegaConf configuration; must provide ``async_training``,
            ``actor_rollout_ref.model.path`` and ``trainer.device``.

    Raises:
        Exception: Any error raised during setup or training is logged with its
            traceback and re-raised. Ray is shut down in every case.
    """
    setup_logging()

    logger.info("Starting fully async PPO training...")

    # Initialize Ray unless the caller already did.
    if not ray.is_initialized():
        ray.init(
            address=os.environ.get("RAY_ADDRESS", None),
            runtime_env={"env_vars": {"NCCL_DEBUG": "WARN", "VLLM_USE_V1": "1"}},
        )

    try:
        # Dataset and sampler.
        logger.info("Creating dataset and sampler...")
        from verl.utils import hf_processor, hf_tokenizer

        tokenizer = hf_tokenizer(config.actor_rollout_ref.model.path)
        processor = hf_processor(config.actor_rollout_ref.model.path)

        train_dataset, val_dataset = create_rl_dataset(config, tokenizer, processor)
        train_sampler = create_rl_sampler(config, train_dataset)

        # Collate function.
        from verl.trainer.ppo.ray_trainer import default_collate_fn

        collate_fn = default_collate_fn

        # Reward functions.
        reward_fn, val_reward_fn = load_reward_manager(config, tokenizer)

        # Resource pools and role -> worker-class mapping.
        from verl.single_controller.ray import RayWorkerGroup
        from verl.trainer.ppo.ray_trainer import (
            Role,
            create_resource_pool_manager,
            create_role_worker_mapping,
        )

        role_worker_mapping = create_role_worker_mapping(config)

        # 1. MessageQueue shared by the producer and the consumer.
        logger.info("Creating MessageQueue...")
        max_queue_size = config.async_training.get("max_queue_size", 1000)
        message_queue = MessageQueue.remote(config, max_queue_size)
        message_queue_client = MessageQueueClient(message_queue)

        # 2. Rollouter actor: owns only the rollout role and its resource pool.
        logger.info("Creating Rollouter...")
        rollouter_actor = RollouterActor.remote(
            config=config,
            tokenizer=tokenizer,
            role_worker_mapping={Role.Rollout: role_worker_mapping[Role.Rollout]},
            resource_pool_manager=create_resource_pool_manager(config, roles=[Role.Rollout]),
            ray_worker_group_cls=RayWorkerGroup,
            processor=processor,
            train_dataset=train_dataset,
            collate_fn=collate_fn,
            train_sampler=train_sampler,
            device_name=config.trainer.device,
        )

        ray.get(rollouter_actor.init_workers.remote())
        ray.get(rollouter_actor.set_message_queue_client.remote(message_queue_client))

        # 3. Trainer: owns every role except rollout.
        logger.info("Creating FullyAsyncTrainer...")
        trainer_role_mapping = {
            role: worker_cls for role, worker_cls in role_worker_mapping.items() if role != Role.Rollout
        }

        trainer = FullyAsyncTrainer(
            config=config,
            tokenizer=tokenizer,
            role_worker_mapping=trainer_role_mapping,
            resource_pool_manager=create_resource_pool_manager(config, roles=list(trainer_role_mapping.keys())),
            ray_worker_group_cls=RayWorkerGroup,
            processor=processor,
            reward_fn=reward_fn,
            val_reward_fn=val_reward_fn,
            train_dataset=train_dataset,
            val_dataset=val_dataset,
            collate_fn=collate_fn,
            train_sampler=train_sampler,
            device_name=config.trainer.device,
        )

        trainer.init_workers()
        trainer.set_message_queue_client(message_queue_client)
        trainer.set_rollouter_actor(rollouter_actor)

        # 4. Parameter synchronization.
        logger.info("Setting up parameter synchronization...")
        # TODO: construct the parameter synchronizer and hand it to the rollouter via
        # set_parameter_synchronizer(); nothing is wired up at this point yet.

        # 5. Drive the rollouter from a background thread so the trainer can own the main thread.
        logger.info("Starting Rollouter in background...")

        def run_rollouter():
            try:
                ray.get(rollouter_actor.fit.remote())
            except Exception as e:
                # Keep the traceback: a dead producer otherwise only shows up as trainer timeouts.
                logger.exception("Rollouter error: %s", e)

        rollouter_thread = threading.Thread(target=run_rollouter, daemon=True)
        rollouter_thread.start()

        # Give the rollouter a head start before the trainer begins polling the queue.
        time.sleep(5)

        # 6. Trainer runs on the calling thread.
        logger.info("Starting FullyAsyncTrainer...")
        trainer.fit()

        # 7. Orderly shutdown: producer first, then its driver thread, then the queue.
        logger.info("Shutting down...")
        ray.get(rollouter_actor.shutdown.remote())
        rollouter_thread.join(timeout=10)
        ray.get(message_queue.shutdown.remote())

        logger.info("Fully async PPO training completed successfully!")

    except Exception as e:
        logger.exception("Error in fully async PPO training: %s", e)
        raise
    finally:
        if ray.is_initialized():
            ray.shutdown()


# config_path is resolved relative to this file; the YAML ships in this recipe's own config/ directory.
@hydra.main(config_path="config", config_name="fully_async_ppo_trainer", version_base=None)
def main(config):
    """Hydra entry point."""
    run_fully_async_ppo(config)


if __name__ == "__main__":
    main()
import logging
from pprint import pprint
from typing import Optional

import numpy as np
import ray
from omegaconf import OmegaConf
from torch.utils.data import Dataset, Sampler
from tqdm import tqdm

from recipe.fully_async_policy.message_queue import BatchSample, MessageQueueClient
from verl import DataProto
from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
from verl.single_controller.ray.base import create_colocated_worker_cls
from verl.trainer.ppo.metric_utils import (
    compute_data_metrics,
    compute_throughout_metrics,
    compute_timing_metrics,
)
from verl.trainer.ppo.ray_trainer import (
    ResourcePoolManager,
    Role,
    WorkerType,
    apply_kl_penalty,
    compute_advantage,
    compute_response_mask,
)
from verl.trainer.ppo.reward import compute_reward, compute_reward_async
from verl.utils.debug import marked_timer
from verl.utils.metric import reduce_metrics
from verl.utils.tracking import ValidationGenerationsLogger

logger = logging.getLogger(__name__)


class FullyAsyncTrainer:
    """Fully asynchronous PPO trainer that consumes samples from a MessageQueue.

    After every optimization step the trainer bumps its parameter version,
    publishes it to the queue, and asks the rollouter actor (if set) to update
    its weights.
    """

    def __init__(
        self,
        config,
        tokenizer,
        role_worker_mapping: dict[Role, WorkerType],
        resource_pool_manager: ResourcePoolManager,
        ray_worker_group_cls: type[RayWorkerGroup] = RayWorkerGroup,
        processor=None,
        reward_fn=None,
        val_reward_fn=None,
        train_dataset: Optional[Dataset] = None,
        val_dataset: Optional[Dataset] = None,
        collate_fn=None,
        train_sampler: Optional[Sampler] = None,
        device_name="cuda",
    ):
        """Store configuration and collaborators; no workers are created here.

        Call ``init_workers()``, ``set_message_queue_client()`` and (optionally)
        ``set_rollouter_actor()`` before ``fit()``.
        """
        self.config = config
        self.tokenizer = tokenizer
        self.processor = processor
        self.reward_fn = reward_fn
        self.val_reward_fn = val_reward_fn

        self.role_worker_mapping = role_worker_mapping
        self.resource_pool_manager = resource_pool_manager
        self.ray_worker_group_cls = ray_worker_group_cls
        self.device_name = device_name
        self.validation_generations_logger = ValidationGenerationsLogger()

        # Data.
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.collate_fn = collate_fn
        self.train_sampler = train_sampler

        # Which roles are present in this trainer.
        self.use_reference_policy = Role.RefPolicy in role_worker_mapping
        self.use_rm = Role.RewardModel in role_worker_mapping
        self.use_critic = Role.Critic in role_worker_mapping
        # With LoRA (lora_rank > 0) reference log-probs are computed by the actor worker group.
        self.ref_in_actor = config.actor_rollout_ref.model.get("lora_rank", 0) > 0

        # Worker groups, populated by init_workers().
        self.actor_wg = None
        self.critic_wg = None
        self.ref_policy_wg = None
        self.rm_wg = None

        # Training state.
        self.global_steps = 0
        self.current_param_version = 0
        self.total_training_steps = config.trainer.total_training_steps

        # Sample source and rollouter handle, injected via the setters below.
        self.message_queue_client = None
        self.rollouter_actor = None

        # Statistics.
        self.processed_samples = 0
        self.stale_samples_processed = 0

    def set_message_queue_client(self, message_queue_client: MessageQueueClient):
        """Set the client used to pull samples and publish parameter versions."""
        self.message_queue_client = message_queue_client

    def set_rollouter_actor(self, rollouter_actor):
        """Set the Ray actor handle of the rollouter that receives weight updates."""
        self.rollouter_actor = rollouter_actor

    def init_workers(self):
        """Create the resource pools, spawn the worker groups and initialize their models."""
        logger.info("Initializing FullyAsyncTrainer workers...")

        self.resource_pool_manager.create_resource_pool()
        self.resource_pool_to_cls = {pool: {} for pool in self.resource_pool_manager.resource_pool_dict.values()}

        # Actor worker.
        actor_resource_pool = self.resource_pool_manager.get_resource_pool(Role.Actor)
        actor_cls = RayClassWithInitArgs(
            cls=self.role_worker_mapping[Role.Actor],
            config=self.config.actor_rollout_ref,
            role="actor",
        )
        self.resource_pool_to_cls[actor_resource_pool]["actor"] = actor_cls

        # Critic worker.
        if self.use_critic:
            critic_resource_pool = self.resource_pool_manager.get_resource_pool(Role.Critic)
            critic_cls = RayClassWithInitArgs(cls=self.role_worker_mapping[Role.Critic], config=self.config.critic)
            self.resource_pool_to_cls[critic_resource_pool]["critic"] = critic_cls

        # Reference-policy worker.
        if self.use_reference_policy:
            ref_resource_pool = self.resource_pool_manager.get_resource_pool(Role.RefPolicy)
            ref_policy_cls = RayClassWithInitArgs(
                cls=self.role_worker_mapping[Role.RefPolicy],
                config=self.config.actor_rollout_ref,
                role="ref",
            )
            self.resource_pool_to_cls[ref_resource_pool]["ref"] = ref_policy_cls

        # Reward-model worker.
        if self.use_rm:
            rm_resource_pool = self.resource_pool_manager.get_resource_pool(Role.RewardModel)
            rm_cls = RayClassWithInitArgs(
                cls=self.role_worker_mapping[Role.RewardModel], config=self.config.reward_model
            )
            self.resource_pool_to_cls[rm_resource_pool]["rm"] = rm_cls

        # Spawn one colocated worker group per resource pool.
        all_wg = {}
        wg_kwargs = {}
        if OmegaConf.select(self.config.trainer, "ray_wait_register_center_timeout") is not None:
            wg_kwargs["ray_wait_register_center_timeout"] = self.config.trainer.ray_wait_register_center_timeout

        for resource_pool, class_dict in self.resource_pool_to_cls.items():
            worker_dict_cls = create_colocated_worker_cls(class_dict=class_dict)
            wg_dict = self.ray_worker_group_cls(
                resource_pool=resource_pool,
                ray_cls_with_init=worker_dict_cls,
                device_name=self.device_name,
                **wg_kwargs,
            )
            spawn_wg = wg_dict.spawn(prefix_set=class_dict.keys())
            all_wg.update(spawn_wg)

        # Bind the spawned groups and load their models.
        self.actor_wg = all_wg["actor"]
        self.actor_wg.init_model()

        if self.use_critic:
            self.critic_wg = all_wg["critic"]
            self.critic_wg.init_model()

        # When the reference policy lives inside the actor, ref_policy_wg stays None.
        if self.use_reference_policy and not self.ref_in_actor:
            self.ref_policy_wg = all_wg["ref"]
            self.ref_policy_wg.init_model()

        if self.use_rm:
            self.rm_wg = all_wg["rm"]
            self.rm_wg.init_model()

        logger.info("FullyAsyncTrainer workers initialized successfully")

    def _load_checkpoint(self):
        """Load a checkpoint (placeholder: currently does nothing)."""
        pass

    def _validate(self):
        """Run validation (placeholder).

        Returns:
            ``None`` when no validation reward function is set, otherwise a
            fixed ``{"val_reward": 0.0}`` dict; no real evaluation happens yet.
        """
        if self.val_reward_fn is None:
            return None

        logger.info("Validation step skipped in async trainer")
        return {"val_reward": 0.0}

    def _save_checkpoint(self):
        """Save a checkpoint (placeholder: currently does nothing)."""
        pass

    def _dump_generations(self, inputs, outputs, scores, reward_extra_infos_dict, dump_path):
        """Persist rollout generations (placeholder: currently does nothing)."""
        pass

    def _update_param_version_and_sync(self):
        """Increment the parameter version and propagate it to the queue and the rollouter."""
        self.current_param_version += 1

        # Let the queue know the new version (it is used there for freshness checks).
        self.message_queue_client.update_param_version(self.current_param_version)

        # Block until the rollouter has handled the weight update.
        if self.rollouter_actor is not None:
            ray.get(self.rollouter_actor.update_rollout_weights.remote(self.current_param_version))

    def _process_batch_samples(self, batch_samples: list[BatchSample]) -> DataProto:
        """Merge the ``data`` payloads of the queued samples into a single ``DataProto``."""
        if len(batch_samples) == 1:
            return batch_samples[0].data

        return DataProto.concat([sample.data for sample in batch_samples])

    def _profiled_worker_groups(self) -> list:
        """Return the worker groups that were actually created.

        ``ref_policy_wg`` is ``None`` when the reference policy runs inside the
        actor, so it must not be dereferenced unconditionally.
        """
        candidates = (self.actor_wg, self.ref_policy_wg, self.critic_wg, self.rm_wg)
        return [wg for wg in candidates if wg is not None]

    def _start_profiling(self):
        """Start profiling on every existing worker group."""
        for wg in self._profiled_worker_groups():
            wg.start_profile()

    def _stop_profiling(self):
        """Stop profiling on every existing worker group."""
        for wg in self._profiled_worker_groups():
            wg.end_profile()

    def fit(self):
        """Run the main training loop until ``total_training_steps`` steps are done.

        Each step pulls samples from the MessageQueue, computes rewards,
        log-probs and advantages, updates the critic/actor, and syncs the new
        parameter version to the rollouter. A queue timeout does not count as a
        step; the loop simply retries.

        Raises:
            ValueError: If no MessageQueue client was set, or if neither a
                reward function nor a reward model is available.
        """
        # Fail fast, before any tracker/validation side effects.
        if self.message_queue_client is None:
            raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.")

        from verl.utils.tracking import Tracking

        # Deliberately NOT named `logger`: that would shadow the module logger, and the
        # experiment tracker is only used through .log() here.
        tracker = Tracking(
            project_name=self.config.trainer.project_name,
            experiment_name=self.config.trainer.experiment_name,
            default_backend=self.config.trainer.logger,
            config=OmegaConf.to_container(self.config, resolve=True),
        )

        self.global_steps = 0

        self._load_checkpoint()

        # Optional validation before training.
        if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True):
            val_metrics = self._validate()
            if val_metrics:
                pprint(f"Initial validation metrics: {val_metrics}")
                tracker.log(data=val_metrics, step=self.global_steps)
            if self.config.trainer.get("val_only", False):
                return

        progress_bar = tqdm(total=self.total_training_steps, initial=self.global_steps, desc="Training Progress")

        # Steps are 1-based from here on.
        self.global_steps += 1
        last_val_metrics = None

        logger.info("Starting fully async training loop...")

        min_batch_count = self.config.async_training.get("min_batch_count", 1)
        batch_timeout = self.config.async_training.get("batch_timeout", 30.0)

        while self.global_steps <= self.total_training_steps:
            do_profile = (
                self.global_steps in self.config.trainer.profile_steps
                if self.config.trainer.profile_steps is not None
                else False
            )

            if do_profile:
                self._start_profiling()

            metrics = {}
            timing_raw = {}

            with marked_timer("step", timing_raw):
                # Pull samples from the queue.
                with marked_timer("get_batch_from_queue", timing_raw, color="blue"):
                    batch_samples = self.message_queue_client.get_batch(
                        min_batch_count=min_batch_count, timeout=batch_timeout
                    )

                # None means timeout; an empty list is treated the same way.
                if not batch_samples:
                    logger.warning("Timeout waiting for batch samples, continuing...")
                    if do_profile:
                        # Keep start/end balanced: the retry starts profiling again.
                        self._stop_profiling()
                    continue

                batch = self._process_batch_samples(batch_samples)

                # Sample freshness: how many parameter versions behind each sample is.
                sample_ages = [self.current_param_version - sample.param_version for sample in batch_samples]
                avg_sample_age = np.mean(sample_ages)
                max_sample_age = max(sample_ages)

                logger.info(
                    f"Processing batch with {len(batch_samples)} samples, "
                    f"avg_age={avg_sample_age:.1f}, max_age={max_sample_age}"
                )

                batch.batch["response_mask"] = compute_response_mask(batch)

                # NOTE(review): the call signatures and return shapes of compute_reward,
                # compute_reward_async, apply_kl_penalty and compute_advantage are used as
                # written in the original code; confirm them against the installed verl version.
                with marked_timer("compute_reward", timing_raw, color="yellow"):
                    if self.reward_fn is not None:
                        batch, reward_extra_infos_dict = compute_reward(
                            batch, reward_fn=self.reward_fn, tokenizer=self.tokenizer
                        )
                    elif self.use_rm:
                        batch, reward_extra_infos_dict = compute_reward_async(
                            batch, rm_wg=self.rm_wg, tokenizer=self.tokenizer
                        )
                    else:
                        raise ValueError("No reward function or reward model provided")

                # Reference log-probabilities.
                if self.use_reference_policy:
                    with marked_timer("compute_ref_log_prob", timing_raw, color="green"):
                        if self.ref_in_actor:
                            ref_log_prob_output = self.actor_wg.compute_ref_log_prob(batch)
                        else:
                            ref_log_prob_output = self.ref_policy_wg.compute_log_prob(batch)
                        batch = batch.union(ref_log_prob_output)

                # Actor log-probabilities.
                with marked_timer("compute_log_prob", timing_raw, color="cyan"):
                    log_prob_output = self.actor_wg.compute_log_prob(batch)
                    batch = batch.union(log_prob_output)

                # KL penalty against the reference policy.
                if self.use_reference_policy:
                    batch = apply_kl_penalty(batch, self.config.algorithm)

                # Values and advantages.
                if self.use_critic:
                    with marked_timer("compute_values", timing_raw, color="magenta"):
                        values_output = self.critic_wg.compute_values(batch)
                        batch = batch.union(values_output)

                with marked_timer("compute_advantage", timing_raw, color="orange"):
                    batch = compute_advantage(batch, self.config.algorithm)

                # Critic update.
                if self.use_critic:
                    with marked_timer("update_critic", timing_raw, color="pink"):
                        critic_output = self.critic_wg.update_critic(batch)
                    critic_output_metrics = reduce_metrics(critic_output.meta_info["metrics"])
                    metrics.update(critic_output_metrics)

                # Actor update, skipped while the critic is still warming up.
                if self.config.trainer.critic_warmup <= self.global_steps:
                    with marked_timer("update_actor", timing_raw, color="red"):
                        batch.meta_info["multi_turn"] = self.config.actor_rollout_ref.rollout.multi_turn.enable
                        actor_output = self.actor_wg.update_actor(batch)
                    actor_output_metrics = reduce_metrics(actor_output.meta_info["metrics"])
                    metrics.update(actor_output_metrics)

                # Publish the new parameter version and push it to the rollouter.
                with marked_timer("sync_params", timing_raw, color="purple"):
                    self._update_param_version_and_sync()

                # Optionally dump the rollout generations.
                rollout_data_dir = self.config.trainer.get("rollout_data_dir", None)
                if rollout_data_dir:
                    with marked_timer("dump_rollout_generations", timing_raw, color="green"):
                        inputs = self.tokenizer.batch_decode(batch.batch["prompts"], skip_special_tokens=True)
                        outputs = self.tokenizer.batch_decode(batch.batch["responses"], skip_special_tokens=True)
                        scores = batch.batch["token_level_scores"].sum(-1).cpu().tolist()
                        self._dump_generations(
                            inputs=inputs,
                            outputs=outputs,
                            scores=scores,
                            reward_extra_infos_dict=reward_extra_infos_dict,
                            dump_path=rollout_data_dir,
                        )

                # Periodic validation.
                if (
                    self.val_reward_fn is not None
                    and self.config.trainer.val_freq is not None
                    and self.global_steps % self.config.trainer.val_freq == 0
                ):
                    with marked_timer("validation", timing_raw, color="brown"):
                        val_metrics = self._validate()
                        if val_metrics:
                            pprint(f"Validation metrics at step {self.global_steps}: {val_metrics}")
                        last_val_metrics = val_metrics

            # Count this step's samples before logging so the metric is not one step behind.
            self.processed_samples += len(batch_samples)

            # Performance metrics; computed after the "step" timer has closed.
            timing_metrics = compute_timing_metrics(timing_raw)
            throughput_metrics = compute_throughout_metrics(timing_raw, len(batch))
            data_metrics = compute_data_metrics(batch, self.tokenizer)

            freshness_metrics = {
                "avg_sample_age": avg_sample_age,
                "max_sample_age": max_sample_age,
                "processed_samples": self.processed_samples,
                "param_version": self.current_param_version,
            }

            metrics.update(timing_metrics)
            metrics.update(throughput_metrics)
            metrics.update(data_metrics)
            metrics.update(freshness_metrics)

            if last_val_metrics is not None:
                metrics.update(last_val_metrics)
                last_val_metrics = None

            tracker.log(data=metrics, step=self.global_steps)

            progress_bar.update(1)
            progress_bar.set_postfix(
                {
                    "reward": f"{metrics.get('reward/mean', 0):.3f}",
                    "kl": f"{metrics.get('actor/approx_kl', 0):.3f}",
                    "queue_size": self.message_queue_client.get_queue_size(),
                    "param_version": self.current_param_version,
                }
            )

            # Periodic checkpoint.
            if self.config.trainer.save_freq is not None and self.global_steps % self.config.trainer.save_freq == 0:
                self._save_checkpoint()

            if do_profile:
                self._stop_profiling()

            self.global_steps += 1

        progress_bar.close()
        # global_steps is one past the last completed step when the loop exits.
        logger.info(f"Training completed after {self.global_steps - 1} steps")

        # Final validation.
        if self.val_reward_fn is not None:
            val_metrics = self._validate()
            if val_metrics:
                pprint(f"Final validation metrics: {val_metrics}")
                tracker.log(data=val_metrics, step=self.global_steps)

        # Final checkpoint.
        self._save_checkpoint()

    def get_statistics(self) -> dict:
        """Return a snapshot of the trainer's counters and the current queue size."""
        return {
            "global_steps": self.global_steps,
            "processed_samples": self.processed_samples,
            "stale_samples_processed": self.stale_samples_processed,
            "current_param_version": self.current_param_version,
            "queue_size": self.message_queue_client.get_queue_size() if self.message_queue_client else 0,
        }
+ +import threading +import time +import uuid +from collections import deque +from dataclasses import dataclass +from typing import Any, Optional + +import ray +import zmq +from filelock import FileLock +from omegaconf import DictConfig + +from verl import DataProto + + +@dataclass +class BatchSample: + """单个batch样本,包含参数版本和新鲜度信息""" + + batch_id: str + epoch: int + data: DataProto + param_version: int + timestamp: float + rollout_metadata: dict[str, Any] + + +@ray.remote(num_cpus=1) +class MessageQueue: + """ + 基于ZeroMQ的异步消息队列,用于Rollouter和Trainer之间的通信 + """ + + def __init__(self, config: DictConfig, max_queue_size: int = 1000): + self.config = config + self.max_queue_size = max_queue_size + self.queue = deque(maxlen=max_queue_size) + self.current_param_version = 0 + self.freshness_threshold = config.async_training.get("freshness_threshold", 3) + + # ZeroMQ setup + self.context = zmq.Context() + self.socket = None + self.address = None + self._setup_zmq() + + # Threading for message handling + self.running = True + self.lock = threading.RLock() + self.consumer_waiting = False + self.consumer_condition = threading.Condition(self.lock) + + # Statistics + self.total_produced = 0 + self.total_consumed = 0 + self.dropped_samples = 0 + + def _setup_zmq(self): + """设置ZeroMQ socket""" + with FileLock("/tmp/verl_message_queue.lock"): + # 使用TCP socket + import socket as sock + + with sock.socket() as s: + s.bind(("", 0)) + port = s.getsockname()[1] + + self.address = f"tcp://127.0.0.1:{port}" + self.socket = self.context.socket(zmq.PAIR) + self.socket.bind(self.address) + + def put_batch( + self, epoch: int, batch: DataProto, param_version: int, rollout_metadata: dict[str, Any] = None + ) -> bool: + """ + 放入一个batch样本到队列 + + Args: + epoch: 当前epoch + batch: 样本数据 + param_version: 参数版本号 + rollout_metadata: rollout相关的元数据 + + Returns: + bool: 是否成功放入队列 + """ + with self.lock: + # 检查新鲜度 + staleness = self.current_param_version - param_version + if staleness >= 
self.freshness_threshold: + self.dropped_samples += 1 + return False + + sample = BatchSample( + batch_id=str(uuid.uuid4()), + epoch=epoch, + data=batch, + param_version=param_version, + timestamp=time.time(), + rollout_metadata=rollout_metadata or {}, + ) + + # 如果队列满了,移除最旧的样本 + if len(self.queue) >= self.max_queue_size: + removed = self.queue.popleft() + self.dropped_samples += 1 + print(f"Queue full, dropped sample {removed.batch_id}") + + self.queue.append(sample) + self.total_produced += 1 + + # 通知等待的消费者 + if self.consumer_waiting: + self.consumer_condition.notify() + + return True + + def get_batch(self, min_batch_count: int = 1, timeout: float = 30.0) -> Optional[list[BatchSample]]: + """ + 从队列获取batch样本 + + Args: + min_batch_count: 最小batch数量 + timeout: 超时时间(秒) + + Returns: + Optional[List[BatchSample]]: 获取的样本列表,如果超时返回None + """ + with self.lock: + start_time = time.time() + + while len(self.queue) < min_batch_count: + if time.time() - start_time > timeout: + return None + + self.consumer_waiting = True + self.consumer_condition.wait(timeout=1.0) + self.consumer_waiting = False + + # 获取指定数量的样本 + batch_count = min(min_batch_count, len(self.queue)) + samples = [] + for _ in range(batch_count): + if self.queue: + samples.append(self.queue.popleft()) + + self.total_consumed += len(samples) + return samples + + def update_param_version(self, version: int): + """更新当前参数版本""" + with self.lock: + self.current_param_version = version + + def get_queue_size(self) -> int: + """获取当前队列长度""" + with self.lock: + return len(self.queue) + + def get_statistics(self) -> dict[str, Any]: + """获取队列统计信息""" + with self.lock: + return { + "queue_size": len(self.queue), + "total_produced": self.total_produced, + "total_consumed": self.total_consumed, + "dropped_samples": self.dropped_samples, + "current_param_version": self.current_param_version, + "freshness_threshold": self.freshness_threshold, + } + + def clear_queue(self): + """清空队列""" + with self.lock: + self.queue.clear() + + def 
shutdown(self): + """关闭消息队列""" + self.running = False + if self.socket: + self.socket.close() + if self.context: + self.context.term() + + def get_address(self) -> str: + """获取ZeroMQ地址""" + return self.address + + +class MessageQueueClient: + """MessageQueue的客户端,用于与MessageQueue Actor通信""" + + def __init__(self, queue_actor: ray.ActorHandle): + self.queue_actor = queue_actor + + def put_batch( + self, epoch: int, batch: DataProto, param_version: int, rollout_metadata: dict[str, Any] = None + ) -> bool: + """放入batch到队列""" + return ray.get(self.queue_actor.put_batch.remote(epoch, batch, param_version, rollout_metadata)) + + def get_batch(self, min_batch_count: int = 1, timeout: float = 30.0) -> Optional[list[BatchSample]]: + """从队列获取batch""" + return ray.get(self.queue_actor.get_batch.remote(min_batch_count, timeout)) + + def update_param_version(self, version: int): + """更新参数版本""" + ray.get(self.queue_actor.update_param_version.remote(version)) + + def get_queue_size(self) -> int: + """获取队列大小""" + return ray.get(self.queue_actor.get_queue_size.remote()) + + def get_statistics(self) -> dict[str, Any]: + """获取统计信息""" + return ray.get(self.queue_actor.get_statistics.remote()) + + def clear_queue(self): + """清空队列""" + ray.get(self.queue_actor.clear_queue.remote()) + + def shutdown(self): + """关闭队列""" + ray.get(self.queue_actor.shutdown.remote()) diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py new file mode 100644 index 00000000000..272f890cbbc --- /dev/null +++ b/recipe/fully_async_policy/param_sync.py @@ -0,0 +1,175 @@ +# Copyright 2025 Meituan Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging + +import ray +from ray.util.collective import collective + +logger = logging.getLogger(__name__) + + +class ParameterSynchronizer: + """ + 参数同步器,负责在actor和rollout之间同步模型参数 + """ + + def __init__(self, config): + self.config = config + self.weights_info = None + self.sync_group_initialized = False + + def initialize_sync_group(self, actor_workers: list, rollout_workers: list): + """ + 初始化参数同步组 + + Args: + actor_workers: actor worker列表 + rollout_workers: rollout worker列表 + """ + logger.info("Initializing parameter synchronization group...") + + try: + # 获取actor的权重信息 + if actor_workers: + self.weights_info = ray.get(actor_workers[0].get_actor_weights_info.remote())[0] + + # 设置rollout的权重信息 + for rollout_worker in rollout_workers: + ray.get(rollout_worker.set_actor_weights_info.remote(self.weights_info)) + + # 创建actor-rollout通信组 + all_workers = actor_workers + rollout_workers + collective.create_collective_group( + all_workers, + len(all_workers), + list(range(0, len(all_workers))), + backend="nccl", + group_name="actor_rollout", + ) + + self.sync_group_initialized = True + logger.info("Parameter synchronization group initialized successfully") + + except Exception as e: + logger.error(f"Failed to initialize sync group: {e}") + raise + + def sync_weights(self, actor_workers: list, rollout_workers: list): + """ + 同步权重从actor到rollout + + Args: + actor_workers: actor worker列表 + rollout_workers: rollout worker列表 + """ + if not self.sync_group_initialized: + raise RuntimeError("Sync group not initialized. 
Call initialize_sync_group() first.") + + logger.debug("Synchronizing weights from actor to rollout...") + + try: + # 同步权重 + sync_futures = [] + + # Actor端同步 + for actor_worker in actor_workers: + future = actor_worker.sync_rollout_weights.remote() + sync_futures.append(future) + + # Rollout端同步 + for rollout_worker in rollout_workers: + future = rollout_worker.sync_rollout_weights.remote() + sync_futures.append(future) + + # 等待所有同步完成 + ray.get(sync_futures) + + logger.debug("Weight synchronization completed") + + except Exception as e: + logger.error(f"Failed to sync weights: {e}") + raise + + +@ray.remote +class ParameterSyncManager: + """ + Ray Actor形式的参数同步管理器 + """ + + def __init__(self, config): + self.config = config + self.synchronizer = ParameterSynchronizer(config) + self.actor_workers = [] + self.rollout_workers = [] + + def register_workers(self, actor_workers: list, rollout_workers: list): + """注册worker""" + self.actor_workers = actor_workers + self.rollout_workers = rollout_workers + + # 初始化同步组 + self.synchronizer.initialize_sync_group(actor_workers, rollout_workers) + + def sync_parameters(self): + """执行参数同步""" + self.synchronizer.sync_weights(self.actor_workers, self.rollout_workers) + return True + + def get_weights_info(self): + """获取权重信息""" + return self.synchronizer.weights_info + + +class AsyncParameterSynchronizer: + """ + 异步参数同步器,用于完全异步训练工作流 + """ + + def __init__(self, config, actor_wg, rollouter_actor): + """ + Args: + config: 配置 + actor_wg: actor worker group + rollouter_actor: rollouter actor引用 + """ + self.config = config + self.actor_wg = actor_wg + self.rollouter_actor = rollouter_actor + self.current_version = 0 + + def sync_to_rollouter(self, new_version: int): + """ + 将actor参数同步到rollouter + + Args: + new_version: 新的参数版本号 + """ + logger.info(f"Syncing parameters to rollouter, version: {new_version}") + + try: + # 通知rollouter更新参数 + ray.get(self.rollouter_actor.update_rollout_weights.remote(new_version)) + + self.current_version = 
new_version + logger.info(f"Parameter sync to rollouter completed, version: {new_version}") + + except Exception as e: + logger.error(f"Failed to sync parameters to rollouter: {e}") + raise + + def get_current_version(self) -> int: + """获取当前参数版本""" + return self.current_version diff --git a/recipe/fully_async_policy/rollouter.py b/recipe/fully_async_policy/rollouter.py new file mode 100644 index 00000000000..d98f5e5fdf5 --- /dev/null +++ b/recipe/fully_async_policy/rollouter.py @@ -0,0 +1,414 @@ +# Copyright 2025 Meituan Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +import threading +import time +import uuid +from typing import Optional + +import numpy as np +import ray +from omegaconf import OmegaConf +from torch.utils.data import Dataset, Sampler + +from recipe.fully_async_policy.message_queue import MessageQueueClient +from verl import DataProto +from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup +from verl.single_controller.ray.base import create_colocated_worker_cls +from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role, WorkerType +from verl.utils.debug import marked_timer + +logger = logging.getLogger(__name__) + + +class RolloutController: + """控制rollout的暂停和恢复""" + + def __init__(self): + self.is_paused = False + self.pause_event = threading.Event() + self.resume_event = threading.Event() + self.resume_event.set() # 初始状态为可运行 + self.pending_requests = [] + self.lock = threading.RLock() + + def pause(self): + """暂停rollout""" + with self.lock: + if not self.is_paused: + self.is_paused = True + self.resume_event.clear() + self.pause_event.set() + logger.info("Rollout paused") + + def resume(self): + """恢复rollout""" + with self.lock: + if self.is_paused: + self.is_paused = False + self.pause_event.clear() + self.resume_event.set() + logger.info("Rollout resumed") + + def wait_if_paused(self, timeout: float = None): + """如果被暂停则等待恢复""" + if self.is_paused: + self.resume_event.wait(timeout) + + def is_pause_requested(self) -> bool: + """检查是否有暂停请求""" + return self.pause_event.is_set() + + +class Rollouter: + """ + 异步样本生成器,负责持续生成训练样本并放入MessageQueue + """ + + def __init__( + self, + config, + tokenizer, + role_worker_mapping: dict[Role, WorkerType], + resource_pool_manager: ResourcePoolManager, + ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, + processor=None, + train_dataset: Optional[Dataset] = None, + collate_fn=None, + train_sampler: Optional[Sampler] = None, + device_name="cuda", + ): + self.config = config + self.tokenizer = tokenizer + self.processor = 
processor + self.role_worker_mapping = role_worker_mapping + self.resource_pool_manager = resource_pool_manager + self.ray_worker_group_cls = ray_worker_group_cls + self.device_name = device_name + + # 数据相关 + self.train_dataset = train_dataset + self.collate_fn = collate_fn + self.train_sampler = train_sampler + + # Rollout控制 + self.rollout_controller = RolloutController() + self.current_param_version = 0 + + # 新鲜度控制 + self.freshness_threshold = config.async_training.get("freshness_threshold", 3) + self.max_staleness_allowed = config.async_training.get("max_staleness_allowed", 5) + + # 统计信息 + self.total_generated_samples = 0 + self.dropped_stale_samples = 0 + self.pause_count = 0 + + # Worker groups + self.rollout_wg = None + self.message_queue_client = None + + # 运行状态 + self.running = False + self.generation_thread = None + + def init_workers(self): + """初始化rollout workers""" + logger.info("Initializing Rollouter workers...") + + self.resource_pool_manager.create_resource_pool() + self.resource_pool_to_cls = {pool: {} for pool in self.resource_pool_manager.resource_pool_dict.values()} + + # 只创建rollout worker + resource_pool = self.resource_pool_manager.get_resource_pool(Role.Rollout) + role_cls = RayClassWithInitArgs( + cls=self.role_worker_mapping[Role.Rollout], + config=self.config.actor_rollout_ref, + role="rollout", + ) + self.resource_pool_to_cls[resource_pool]["rollout"] = role_cls + + # 初始化WorkerGroup + all_wg = {} + wg_kwargs = {} + if OmegaConf.select(self.config.trainer, "ray_wait_register_center_timeout") is not None: + wg_kwargs["ray_wait_register_center_timeout"] = self.config.trainer.ray_wait_register_center_timeout + + for resource_pool, class_dict in self.resource_pool_to_cls.items(): + worker_dict_cls = create_colocated_worker_cls(class_dict=class_dict) + wg_dict = self.ray_worker_group_cls( + resource_pool=resource_pool, + ray_cls_with_init=worker_dict_cls, + device_name=self.device_name, + **wg_kwargs, + ) + spawn_wg = 
wg_dict.spawn(prefix_set=class_dict.keys()) + all_wg.update(spawn_wg) + + self.rollout_wg = all_wg["rollout"] + self.rollout_wg.init_model() + logger.info("Rollouter workers initialized successfully") + + def set_message_queue_client(self, message_queue_client: MessageQueueClient): + """设置消息队列客户端""" + self.message_queue_client = message_queue_client + + def update_rollout_weights(self, param_version: int): + """ + 更新rollout模型参数 + 这个方法由外部Trainer调用 + """ + logger.info(f"Updating rollout weights to version {param_version}") + + # 暂停rollout + self.rollout_controller.pause() + + try: + # 暂停推理引擎 + ray.get(self.rollout_wg.sleep.remote()) + + # 执行参数同步 + # 这里需要与actor建立同步机制 + if hasattr(self, "param_synchronizer") and self.param_synchronizer: + self.param_synchronizer.sync_weights() + else: + logger.warning("Parameter synchronizer not available, skipping weight sync") + + # 更新参数版本 + self.current_param_version = param_version + + # 恢复推理引擎 + ray.get(self.rollout_wg.wake_up.remote()) + + finally: + # 恢复rollout + self.rollout_controller.resume() + + logger.info(f"Rollout weights updated to version {param_version}") + + def set_parameter_synchronizer(self, param_synchronizer): + """设置参数同步器""" + self.param_synchronizer = param_synchronizer + + def _create_dataloader(self): + """创建数据加载器""" + from torch.utils.data import DataLoader + + return DataLoader( + self.train_dataset, + batch_size=self.config.data.train_batch_size, + sampler=self.train_sampler, + collate_fn=self.collate_fn, + num_workers=self.config.data.get("dataloader_num_workers", 0), + drop_last=True, + ) + + def _create_continuous_iterator(self): + """创建连续的数据迭代器""" + dataloader = self._create_dataloader() + + for epoch in range(self.config.trainer.total_epochs): + for batch_dict in dataloader: + yield epoch, batch_dict + + def _should_pause_generation(self) -> bool: + """ + 判断是否应该暂停生成,基于新鲜度控制 + """ + if self.message_queue_client is None: + return False + + queue_stats = self.message_queue_client.get_statistics() + 
queue_size = queue_stats["queue_size"] + current_trainer_version = queue_stats["current_param_version"] + + # 计算参数版本差异 + version_diff = self.current_param_version - current_trainer_version + + # 如果版本差异过大,暂停生成 + if version_diff >= self.max_staleness_allowed: + logger.info( + f"Pausing generation due to staleness: rollout_version={self.current_param_version}, " + f"trainer_version={current_trainer_version}, diff={version_diff}" + ) + return True + + # 如果队列太满,也暂停生成 + max_queue_size = self.freshness_threshold * self.config.data.train_batch_size + if queue_size >= max_queue_size: + logger.info(f"Pausing generation due to full queue: size={queue_size}, max={max_queue_size}") + return True + + return False + + def _generate_batch(self, epoch: int, batch_dict: dict) -> Optional[DataProto]: + """生成单个batch的样本""" + try: + batch = DataProto.from_single_dict(batch_dict) + + # 处理batch用于生成 + batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"] + non_tensor_batch_keys_to_pop = ["raw_prompt_ids"] + + # 处理多模态数据 + if "multi_modal_data" in batch.non_tensor_batch: + non_tensor_batch_keys_to_pop.append("multi_modal_data") + if "raw_prompt" in batch.non_tensor_batch: + non_tensor_batch_keys_to_pop.append("raw_prompt") + if "tools_kwargs" in batch.non_tensor_batch: + non_tensor_batch_keys_to_pop.append("tools_kwargs") + if "interaction_kwargs" in batch.non_tensor_batch: + non_tensor_batch_keys_to_pop.append("interaction_kwargs") + + gen_batch = batch.pop( + batch_keys=batch_keys_to_pop, + non_tensor_batch_keys=non_tensor_batch_keys_to_pop, + ) + + # 重复生成多个响应 + gen_batch = gen_batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True) + + # 执行生成 + if self.config.actor_rollout_ref.rollout.mode == "async": + gen_batch_output = ray.get(self.rollout_wg.async_generate_sequences.remote(gen_batch)) + else: + gen_batch_output = ray.get(self.rollout_wg.generate_sequences.remote(gen_batch)) + + # 添加UID + batch.non_tensor_batch["uid"] = 
np.array([str(uuid.uuid4()) for _ in range(len(batch.batch))], dtype=object) + + # 重复原始batch以对齐生成的响应 + batch = batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True) + + # 合并数据 + final_batch = batch.union(gen_batch_output) + + return final_batch + + except Exception as e: + logger.error(f"Error generating batch: {e}") + return None + + def _generation_loop(self): + """主要的生成循环""" + logger.info("Starting generation loop...") + + continuous_iterator = self._create_continuous_iterator() + + for epoch, batch_dict in continuous_iterator: + if not self.running: + break + + # 等待如果被暂停 + self.rollout_controller.wait_if_paused(timeout=1.0) + + if not self.running: + break + + # 检查是否应该暂停生成 + if self._should_pause_generation(): + time.sleep(1.0) # 等待一段时间再检查 + continue + + # 生成样本 + timing_raw = {} + with marked_timer("generate_batch", timing_raw): + generated_batch = self._generate_batch(epoch, batch_dict) + + if generated_batch is not None: + # 放入队列 + rollout_metadata = { + "timing": timing_raw, + "generation_timestamp": time.time(), + } + + success = self.message_queue_client.put_batch( + epoch=epoch, + batch=generated_batch, + param_version=self.current_param_version, + rollout_metadata=rollout_metadata, + ) + + if success: + self.total_generated_samples += 1 + if self.total_generated_samples % 10 == 0: + logger.info( + f"Generated {self.total_generated_samples} batches, " + f"param_version={self.current_param_version}" + ) + else: + self.dropped_stale_samples += 1 + logger.warning(f"Dropped stale sample, total dropped: {self.dropped_stale_samples}") + + logger.info("Generation loop finished") + + def fit(self): + """开始异步生成样本""" + logger.info("Starting Rollouter...") + + if self.message_queue_client is None: + raise ValueError("MessageQueue client not set. 
Call set_message_queue_client() first.") + + self.running = True + + # 在单独的线程中运行生成循环 + self.generation_thread = threading.Thread(target=self._generation_loop, daemon=True) + self.generation_thread.start() + + try: + # 主线程保持运行,处理控制信号 + while self.running: + time.sleep(1.0) + + # 定期打印统计信息 + if self.total_generated_samples > 0 and self.total_generated_samples % 100 == 0: + queue_stats = self.message_queue_client.get_statistics() + logger.info( + f"Rollouter stats - Generated: {self.total_generated_samples}, " + f"Dropped: {self.dropped_stale_samples}, " + f"Queue size: {queue_stats['queue_size']}, " + f"Param version: {self.current_param_version}" + ) + + except KeyboardInterrupt: + logger.info("Received interrupt signal, shutting down...") + finally: + self.shutdown() + + def shutdown(self): + """关闭Rollouter""" + logger.info("Shutting down Rollouter...") + + self.running = False + + # 恢复可能被暂停的生成线程 + self.rollout_controller.resume() + + # 等待生成线程结束 + if self.generation_thread and self.generation_thread.is_alive(): + self.generation_thread.join(timeout=5.0) + + logger.info("Rollouter shutdown complete") + + def get_statistics(self) -> dict: + """获取统计信息""" + return { + "total_generated_samples": self.total_generated_samples, + "dropped_stale_samples": self.dropped_stale_samples, + "current_param_version": self.current_param_version, + "pause_count": self.pause_count, + "is_running": self.running, + "is_paused": self.rollout_controller.is_paused, + } diff --git a/recipe/fully_async_policy/run_fully_async_example.sh b/recipe/fully_async_policy/run_fully_async_example.sh new file mode 100644 index 00000000000..d58e4ecc771 --- /dev/null +++ b/recipe/fully_async_policy/run_fully_async_example.sh @@ -0,0 +1,149 @@ +#!/bin/bash +# Copyright 2025 Meituan Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x + +# 实验配置 +project_name='FullyAsyncPPO' +exp_name='async-qwen2.5-7b-test' + +# 模型和数据路径 +MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen2.5-7B-Instruct"} +TRAIN_FILE=${TRAIN_FILE:-"~/data/train.parquet"} +VAL_FILE=${VAL_FILE:-"~/data/val.parquet"} + +# 硬件配置 +NNODES=${NNODES:-1} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} + +# 异步训练资源分配 +n_gpus_rollout=3 # rollout专用GPU数量 +n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout)) # 训练GPU数量 + +echo "===================================" +echo "完全异步PPO训练启动" +echo "===================================" +echo "模型路径: $MODEL_PATH" +echo "训练数据: $TRAIN_FILE" +echo "验证数据: $VAL_FILE" +echo "节点数: $NNODES" +echo "每节点GPU数: $NGPUS_PER_NODE" +echo "Rollout GPU数: $n_gpus_rollout" +echo "训练GPU数: $n_gpus_training" +echo "===================================" + +# 算法参数 +temperature=1.0 +top_p=1.0 +top_k=-1 + +# 序列长度 +max_prompt_length=1024 +max_response_length=1024 + +# 异步训练参数 +freshness_threshold=3 +max_staleness_allowed=5 +max_queue_size=1000 +min_batch_count=1 +batch_timeout=30.0 + +# 训练参数 +train_batch_size=128 +total_training_steps=1000 +save_freq=100 +val_freq=50 + +# 设置环境变量 +export NCCL_DEBUG=WARN +export VLLM_USE_V1=1 +export VERL_LOGGING_LEVEL=INFO + +# 启动训练 +python -m recipe.one_step_off_policy.fully_async_main \ + trainer.project_name="$project_name" \ + trainer.experiment_name="$exp_name" \ + trainer.device=cuda \ + trainer.nnodes=$NNODES \ + trainer.n_gpus_per_node=$NGPUS_PER_NODE \ + data.train_files="$TRAIN_FILE" \ + data.val_files="$VAL_FILE" \ + data.train_batch_size=$train_batch_size \ + data.max_prompt_length=$max_prompt_length 
\ + data.max_response_length=$max_response_length \ + data.train_files="$TRAIN_FILE" \ + data.val_files="$VAL_FILE" \ + data.train_batch_size=$train_batch_size \ + data.max_prompt_length=$max_prompt_length \ + data.max_response_length=$max_response_length \ + \ + # 模型配置 + actor_rollout_ref.model.path="$MODEL_PATH" \ + actor_rollout_ref.model.lora_rank=64 \ + actor_rollout_ref.model.lora_alpha=128 \ + \ + # Rollout配置 + actor_rollout_ref.rollout.mode=async \ + actor_rollout_ref.rollout.n_gpus=$n_gpus_rollout \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.temperature=$temperature \ + actor_rollout_ref.rollout.top_k=$top_k \ + actor_rollout_ref.rollout.top_p=$top_p \ + actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ + actor_rollout_ref.rollout.max_num_batched_tokens=8192 \ + actor_rollout_ref.rollout.free_cache_engine=true \ + actor_rollout_ref.rollout.enforce_eager=true \ + \ + # Actor配置 + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.ppo_mini_batch_size=32 \ + actor_rollout_ref.actor.use_dynamic_bsz=true \ + actor_rollout_ref.actor.fsdp_config.param_offload=false \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=false \ + \ + # Critic配置 + critic.model.path="$MODEL_PATH" \ + critic.optim.lr=1e-5 \ + critic.fsdp_config.param_offload=false \ + \ + # 异步训练配置 + async_training.freshness_threshold=$freshness_threshold \ + async_training.max_staleness_allowed=$max_staleness_allowed \ + async_training.max_queue_size=$max_queue_size \ + async_training.min_batch_count=$min_batch_count \ + async_training.batch_timeout=$batch_timeout \ + \ + # 训练配置 + trainer.total_training_steps=$total_training_steps \ + trainer.save_freq=$save_freq \ + trainer.val_freq=$val_freq \ + trainer.critic_warmup=0 \ + \ + # 算法配置 + algorithm.adv_estimator=gae \ + algorithm.cliprange=0.2 \ + algorithm.vf_coeff=0.1 \ + algorithm.entropy_coeff=0.01 \ + algorithm.kl_coeff=0.1 \ + \ + # 日志配置 + 
trainer.logger='["console", "wandb"]' \ + trainer.val_before_train=false + +echo "===================================" +echo "完全异步PPO训练完成" +echo "===================================" + diff --git a/recipe/fully_async_policy/test_fully_async.py b/recipe/fully_async_policy/test_fully_async.py new file mode 100644 index 00000000000..b2f7f866fd7 --- /dev/null +++ b/recipe/fully_async_policy/test_fully_async.py @@ -0,0 +1,197 @@ +# Copyright 2025 Meituan Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +测试完全异步训练工作流的组件 +""" + +import logging +import unittest +from unittest.mock import Mock + +import ray +from omegaconf import OmegaConf + +from recipe.fully_async_policy.message_queue import MessageQueue, MessageQueueClient +from verl import DataProto + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +class TestMessageQueue(unittest.TestCase): + """测试MessageQueue组件""" + + def setUp(self): + """设置测试环境""" + if not ray.is_initialized(): + ray.init(local_mode=True) + + config = OmegaConf.create( + { + "async_training": { + "freshness_threshold": 3, + "max_staleness_allowed": 5, + } + } + ) + + self.message_queue = MessageQueue.remote(config, max_queue_size=100) + self.client = MessageQueueClient(self.message_queue) + + def tearDown(self): + """清理测试环境""" + ray.get(self.message_queue.shutdown.remote()) + if ray.is_initialized(): + ray.shutdown() + + def test_basic_put_get(self): + """测试基本的put和get操作""" + # 创建mock数据 + mock_batch = Mock(spec=DataProto) + + # 放入样本 + success = self.client.put_batch(epoch=0, batch=mock_batch, param_version=1, rollout_metadata={"test": "data"}) + self.assertTrue(success) + + # 获取样本 + samples = self.client.get_batch(min_batch_count=1, timeout=5.0) + self.assertIsNotNone(samples) + self.assertEqual(len(samples), 1) + self.assertEqual(samples[0].epoch, 0) + self.assertEqual(samples[0].param_version, 1) + + def test_freshness_control(self): + """测试新鲜度控制""" + mock_batch = Mock(spec=DataProto) + + # 更新参数版本 + self.client.update_param_version(10) + + # 尝试放入过期样本 + success = self.client.put_batch( + epoch=0, + batch=mock_batch, + param_version=5, # 版本差异为5,超过阈值3 + rollout_metadata={}, + ) + self.assertFalse(success) # 应该被拒绝 + + def test_queue_statistics(self): + """测试队列统计信息""" + stats = self.client.get_statistics() + self.assertIn("queue_size", stats) + self.assertIn("total_produced", stats) + self.assertIn("total_consumed", stats) + self.assertIn("dropped_samples", stats) + + +class 
TestRollouterComponents(unittest.TestCase): + """测试Rollouter相关组件""" + + def setUp(self): + """设置测试环境""" + from .rollouter import RolloutController + + self.controller = RolloutController() + + def test_rollout_controller(self): + """测试rollout控制器""" + # 初始状态应该是运行的 + self.assertFalse(self.controller.is_paused) + + # 测试暂停 + self.controller.pause() + self.assertTrue(self.controller.is_paused) + + # 测试恢复 + self.controller.resume() + self.assertFalse(self.controller.is_paused) + + +class TestParameterSync(unittest.TestCase): + """测试参数同步组件""" + + def test_async_parameter_synchronizer(self): + """测试异步参数同步器""" + from recipe.fully_async_policy.param_sync import AsyncParameterSynchronizer + + config = OmegaConf.create({}) + mock_actor_wg = Mock() + mock_rollouter_actor = Mock() + + sync = AsyncParameterSynchronizer(config, mock_actor_wg, mock_rollouter_actor) + + self.assertEqual(sync.get_current_version(), 0) + + +def test_integration(): + """集成测试""" + logger.info("Starting integration test...") + + if not ray.is_initialized(): + ray.init(local_mode=True) + + try: + # 测试MessageQueue和客户端的集成 + config = OmegaConf.create( + { + "async_training": { + "freshness_threshold": 3, + "max_staleness_allowed": 5, + } + } + ) + + message_queue = MessageQueue.remote(config, max_queue_size=10) + client = MessageQueueClient(message_queue) + + # 模拟生产者-消费者场景 + mock_batch = Mock(spec=DataProto) + + # 生产样本 + for i in range(5): + success = client.put_batch(epoch=i, batch=mock_batch, param_version=i, rollout_metadata={"batch_id": i}) + assert success, f"Failed to put batch {i}" + + # 消费样本 + samples = client.get_batch(min_batch_count=3, timeout=10.0) + assert samples is not None, "Failed to get samples" + assert len(samples) == 3, f"Expected 3 samples, got {len(samples)}" + + # 检查统计信息 + stats = client.get_statistics() + assert stats["total_produced"] == 5 + assert stats["total_consumed"] == 3 + + logger.info("Integration test passed!") + + # 清理 + ray.get(message_queue.shutdown.remote()) + + 
finally: + if ray.is_initialized(): + ray.shutdown() + + +if __name__ == "__main__": + # 运行单元测试 + unittest.main(argv=[""], exit=False, verbosity=2) + + # 运行集成测试 + test_integration() + + print("\n" + "=" * 50) + print("所有测试完成!") + print("=" * 50) diff --git a/tests/special_sanity/check_license.py b/tests/special_sanity/check_license.py index 1a2073e6b02..7582c6c18f4 100644 --- a/tests/special_sanity/check_license.py +++ b/tests/special_sanity/check_license.py @@ -23,6 +23,7 @@ license_head_modelbest = "Copyright 2025 ModelBest Inc. and/or its affiliates" license_head_amazon = "Copyright 2025 Amazon.com Inc and/or its affiliates" license_head_facebook = "Copyright (c) 2016- Facebook, Inc" +license_head_meituan = "Copyright 2025 Meituan Ltd. and/or its affiliates" license_headers = [ license_head_bytedance, license_head_bytedance_25, @@ -32,6 +33,7 @@ license_head_modelbest, license_head_amazon, license_head_facebook, + license_head_meituan, ] @@ -54,4 +56,4 @@ if lh in file_content: has_license = True break - assert has_license, f"file {path_in_str} does not contain license" + assert has_license, f"file {path_in_str} does not contain license \n {file_content}" From eb7990380929f73a8121ffa5150466db9eff018a Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Thu, 24 Jul 2025 16:33:19 +0800 Subject: [PATCH 002/182] init async training pipline --- tests/special_sanity/check_license.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/special_sanity/check_license.py b/tests/special_sanity/check_license.py index 7582c6c18f4..d759a417ff4 100644 --- a/tests/special_sanity/check_license.py +++ b/tests/special_sanity/check_license.py @@ -56,4 +56,4 @@ if lh in file_content: has_license = True break - assert has_license, f"file {path_in_str} does not contain license \n {file_content}" + assert has_license, f"file {path_in_str} does not contain license" From 0459298aec438ab75a381b84774b1b643443f0d1 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Fri, 25 Jul 2025 
19:59:34 +0800 Subject: [PATCH 003/182] update code --- recipe/fully_async_policy/RollouterActor.py | 75 ++++ recipe/fully_async_policy/fully_async_main.py | 263 ++++++------ .../fully_async_policy/fully_async_trainer.py | 7 +- recipe/fully_async_policy/message_queue.py | 15 +- recipe/fully_async_policy/rollouter.py | 7 +- recipe/fully_async_policy/test_mq.py | 374 ++++++++++++++++++ verl/trainer/main_ppo.py | 9 +- 7 files changed, 620 insertions(+), 130 deletions(-) create mode 100644 recipe/fully_async_policy/RollouterActor.py create mode 100644 recipe/fully_async_policy/test_mq.py diff --git a/recipe/fully_async_policy/RollouterActor.py b/recipe/fully_async_policy/RollouterActor.py new file mode 100644 index 00000000000..fb5212b577a --- /dev/null +++ b/recipe/fully_async_policy/RollouterActor.py @@ -0,0 +1,75 @@ +# Copyright 2025 Meituan Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import ray + +from recipe.fully_async_policy.rollouter import Rollouter + + +@ray.remote +class RollouterActor: + """Rollouter的Ray Actor包装器""" + + def __init__( + self, + config, + tokenizer, + role_worker_mapping, + resource_pool_manager, + ray_worker_group_cls, + processor=None, + train_dataset=None, + collate_fn=None, + train_sampler=None, + device_name="cuda", + ): + self.rollouter = Rollouter( + config=config, + tokenizer=tokenizer, + role_worker_mapping=role_worker_mapping, + resource_pool_manager=resource_pool_manager, + ray_worker_group_cls=ray_worker_group_cls, + processor=processor, + train_dataset=train_dataset, + collate_fn=collate_fn, + train_sampler=train_sampler, + device_name=device_name, + ) + + def init_workers(self): + """初始化worker""" + return self.rollouter.init_workers() + + def set_message_queue_client(self, message_queue_client): + """设置消息队列客户端""" + return self.rollouter.set_message_queue_client(message_queue_client) + + def set_parameter_synchronizer(self, param_synchronizer): + """设置参数同步器""" + return self.rollouter.set_parameter_synchronizer(param_synchronizer) + + def update_rollout_weights(self, param_version: int): + """更新rollout权重""" + return self.rollouter.update_rollout_weights(param_version) + + def fit(self): + """开始生成循环""" + return self.rollouter.fit() + + def shutdown(self): + """关闭rollouter""" + return self.rollouter.shutdown() + + def get_statistics(self): + """获取统计信息""" + return self.rollouter.get_statistics() diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py index 3bab5d91eb1..f0689a5d28c 100644 --- a/recipe/fully_async_policy/fully_async_main.py +++ b/recipe/fully_async_policy/fully_async_main.py @@ -13,19 +13,32 @@ # limitations under the License. 
import logging -import os import threading import time +import os +import socket + +import hydra +import ray +from omegaconf import OmegaConf + +from recipe.fully_async_policy.RollouterActor import RollouterActor +from verl.experimental.dataset.sampler import AbstractSampler +from verl.trainer.constants_ppo import get_ppo_ray_runtime_env +from verl.trainer.ppo.ray_trainer import RayPPOTrainer +from verl.trainer.ppo.reward import load_reward_manager +from verl.utils.device import is_cuda_available +from verl.utils.import_utils import load_extern_type import hydra import ray from recipe.fully_async_policy.message_queue import MessageQueue, MessageQueueClient -from recipe.fully_async_policy.rollouter import Rollouter -from verl.trainer.main_ppo import create_rl_dataset, create_rl_sampler +from verl.trainer.main_ppo import create_rl_dataset, create_rl_sampler, run_ppo from verl.trainer.ppo.reward import load_reward_manager +from verl.utils.dataset.rl_dataset import collate_fn -from .fully_async_trainer import FullyAsyncTrainer +from fully_async_trainer import FullyAsyncTrainer logger = logging.getLogger(__name__) @@ -35,110 +48,157 @@ def setup_logging(): logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") -@ray.remote -class RollouterActor: - """Rollouter的Ray Actor包装器""" - - def __init__( - self, - config, - tokenizer, - role_worker_mapping, - resource_pool_manager, - ray_worker_group_cls, - processor=None, - train_dataset=None, - collate_fn=None, - train_sampler=None, - device_name="cuda", - ): - self.rollouter = Rollouter( - config=config, - tokenizer=tokenizer, - role_worker_mapping=role_worker_mapping, - resource_pool_manager=resource_pool_manager, - ray_worker_group_cls=ray_worker_group_cls, - processor=processor, - train_dataset=train_dataset, - collate_fn=collate_fn, - train_sampler=train_sampler, - device_name=device_name, - ) - - def init_workers(self): - """初始化worker""" - return self.rollouter.init_workers() 
- - def set_message_queue_client(self, message_queue_client): - """设置消息队列客户端""" - return self.rollouter.set_message_queue_client(message_queue_client) - - def set_parameter_synchronizer(self, param_synchronizer): - """设置参数同步器""" - return self.rollouter.set_parameter_synchronizer(param_synchronizer) +@ray.remote(num_cpus=1) # please make sure main_task is not scheduled on head +class FullyAsyncTaskRunner: + """Ray remote class for executing distributed PPO training tasks. - def update_rollout_weights(self, param_version: int): - """更新rollout权重""" - return self.rollouter.update_rollout_weights(param_version) + This class encapsulates the main training logic and runs as a Ray remote actor + to enable distributed execution across multiple nodes and GPUs. + """ - def fit(self): - """开始生成循环""" - return self.rollouter.fit() + def run(self, config): + """运行完全异步的PPO训练""" + setup_logging() - def shutdown(self): - """关闭rollouter""" - return self.rollouter.shutdown() + logger.info("Starting fully async PPO training...") + # 创建数据集和采样器 + logger.info("Creating dataset and sampler...") + from verl.utils import hf_processor, hf_tokenizer - def get_statistics(self): - """获取统计信息""" - return self.rollouter.get_statistics() + # Print the initial configuration. `resolve=True` will evaluate symbolic values. + from pprint import pprint + from omegaconf import OmegaConf -def run_fully_async_ppo(config): - """运行完全异步的PPO训练""" - setup_logging() + from verl.utils.fs import copy_to_local - logger.info("Starting fully async PPO training...") + print(f"TaskRunner hostname: {socket.gethostname()}, PID: {os.getpid()}") + pprint(OmegaConf.to_container(config, resolve=True)) + OmegaConf.resolve(config) - # 初始化Ray - if not ray.is_initialized(): - ray.init( - address=os.environ.get("RAY_ADDRESS", None), - runtime_env={"env_vars": {"NCCL_DEBUG": "WARN", "VLLM_USE_V1": "1"}}, + # Download the checkpoint from HDFS to the local machine. 
+ # `use_shm` determines whether to use shared memory, which could lead to faster model loading if turned on + local_path = copy_to_local( + config.actor_rollout_ref.model.path, use_shm=config.actor_rollout_ref.model.get("use_shm", False) ) - try: - # 创建数据集和采样器 - logger.info("Creating dataset and sampler...") + # Instantiate the tokenizer and processor. from verl.utils import hf_processor, hf_tokenizer - tokenizer = hf_tokenizer(config.actor_rollout_ref.model.path) - processor = hf_processor(config.actor_rollout_ref.model.path) - - train_dataset, val_dataset = create_rl_dataset(config, tokenizer, processor) - train_sampler = create_rl_sampler(config, train_dataset) + trust_remote_code = config.data.get("trust_remote_code", False) + tokenizer = hf_tokenizer(local_path, trust_remote_code=trust_remote_code) + # Used for multimodal LLM, could be None + processor = hf_processor(local_path, trust_remote_code=trust_remote_code, use_fast=True) + + # Define worker classes based on the actor strategy. 
+ if config.actor_rollout_ref.actor.strategy == "fsdp2": + assert config.actor_rollout_ref.actor.strategy == config.critic.strategy + from verl.single_controller.ray import RayWorkerGroup + + from recipe.one_step_off_policy.fsdp_workers import ( + ActorRolloutRefWorker, + AsyncActorRolloutRefWorker, + CriticWorker, + RolloutWorker, + ) + + actor_rollout_cls = ( + AsyncActorRolloutRefWorker + if config.actor_rollout_ref.rollout.mode == "async" + else ActorRolloutRefWorker + ) + ray_worker_group_cls = RayWorkerGroup + + elif config.actor_rollout_ref.actor.strategy == "megatron": + assert config.actor_rollout_ref.actor.strategy == config.critic.strategy + from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup + + from recipe.one_step_off_policy.megatron_workers import ( + ActorRolloutRefWorker, + AsyncActorRolloutRefWorker, + CriticWorker, + RolloutWorker, + ) + + actor_rollout_cls = ( + AsyncActorRolloutRefWorker + if config.actor_rollout_ref.rollout.mode == "async" + else ActorRolloutRefWorker + ) + ray_worker_group_cls = NVMegatronRayWorkerGroup + + else: + raise NotImplementedError + + from recipe.one_step_off_policy.ray_trainer import ResourcePoolManager, Role + + role_worker_mapping = { + Role.Actor: ray.remote(actor_rollout_cls), + Role.Rollout: ray.remote(RolloutWorker), + Role.Critic: ray.remote(CriticWorker), + } - # 创建collate function - from verl.trainer.ppo.ray_trainer import default_collate_fn + global_pool_id = "actor_pool" + rollout_pool_id = "rollout_pool" - collate_fn = default_collate_fn + assert config.trainer.n_gpus_per_node > 0, "config.trainer.n_gpus_per_node must be greater than 0" + assert config.trainer.nnodes > 0, "config.trainer.nnodes must be greater than 0" + assert config.rollout.n_gpus_per_node > 0, "config.rollout.n_gpus_per_node must be greater than 0" + assert config.rollout.nnodes > 0, "config.rollout.nnodes must be greater than 0" - # 创建奖励函数 - reward_fn, val_reward_fn = load_reward_manager(config, tokenizer) + 
actor_pool = [config.trainer.n_gpus_per_node] * config.trainer.nnodes + rollout_pool = [config.rollout.n_gpus_per_node] * config.rollout.nnodes - # 创建资源池管理器和worker映射 - from verl.single_controller.ray import RayWorkerGroup - from verl.trainer.ppo.ray_trainer import ( - Role, - create_resource_pool_manager, - create_role_worker_mapping, + resource_pool_spec = { + "actor_pool": actor_pool, + "rollout_pool": rollout_pool, + } + mapping = { + Role.Actor: global_pool_id, + Role.Rollout: rollout_pool_id, + Role.Critic: global_pool_id, + } + print(f"resource_pool_spec: {resource_pool_spec}") + # We should adopt a multi-source reward function here: + # - for rule-based rm, we directly call a reward score + # - for model-based rm, we call a model + # - for code related prompt, we send to a sandbox if there are test cases + # finally, we combine all the rewards together + # The reward type depends on the tag of the data + if config.reward_model.enable: + if config.reward_model.strategy == "fsdp2": + from verl.workers.fsdp_workers import RewardModelWorker + elif config.reward_model.strategy == "megatron": + from verl.workers.megatron_workers import RewardModelWorker + else: + raise NotImplementedError + role_worker_mapping[Role.RewardModel] = ray.remote(RewardModelWorker) + mapping[Role.RewardModel] = global_pool_id + + # Add a reference policy worker if KL loss or KL reward is used. + if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss: + role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker) + mapping[Role.RefPolicy] = global_pool_id + + # Load the reward manager for training and validation. 
+ reward_fn = load_reward_manager( + config, tokenizer, num_examine=0, **config.reward_model.get("reward_kwargs", {}) ) + val_reward_fn = load_reward_manager( + config, tokenizer, num_examine=1, **config.reward_model.get("reward_kwargs", {}) + ) + resource_pool_manager = ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=mapping) + + from verl.utils.dataset.rl_dataset import collate_fn - # resource_pool_manager = create_resource_pool_manager(config) - role_worker_mapping = create_role_worker_mapping(config) + # Create training and validation datasets. + train_dataset = create_rl_dataset(config.data.train_files, config.data, tokenizer, processor) + val_dataset = create_rl_dataset(config.data.val_files, config.data, tokenizer, processor) + train_sampler = create_rl_sampler(config.data, train_dataset) # 1. 创建MessageQueue logger.info("Creating MessageQueue...") + # todo max_queue_size auto compute max_queue_size = config.async_training.get("max_queue_size", 1000) message_queue = MessageQueue.remote(config, max_queue_size) message_queue_client = MessageQueueClient(message_queue) @@ -168,6 +228,9 @@ def run_fully_async_ppo(config): role: worker_cls for role, worker_cls in role_worker_mapping.items() if role != Role.Rollout } + # 创建奖励函数 + reward_fn, val_reward_fn = load_reward_manager(config, tokenizer) + trainer = FullyAsyncTrainer( config=config, tokenizer=tokenizer, @@ -207,37 +270,15 @@ def run_rollouter(): rollouter_thread = threading.Thread(target=run_rollouter, daemon=True) rollouter_thread.start() - # 等待一下让Rollouter启动 - time.sleep(5) - # 6. 启动Trainer(主线程) logger.info("Starting FullyAsyncTrainer...") trainer.fit() - # 7. 
关闭 - logger.info("Shutting down...") - ray.get(rollouter_actor.shutdown.remote()) - - # 等待Rollouter线程结束 - rollouter_thread.join(timeout=10) - - # 关闭MessageQueue - ray.get(message_queue.shutdown.remote()) - - logger.info("Fully async PPO training completed successfully!") - - except Exception as e: - logger.error(f"Error in fully async PPO training: {e}") - raise - finally: - if ray.is_initialized(): - ray.shutdown() - -@hydra.main(config_path="../one_step_off_policy/config", config_name="fully_async_ppo_trainer", version_base=None) +@hydra.main(config_path="config", config_name="fully_async_ppo_trainer", version_base=None) def main(config): """主入口函数""" - run_fully_async_ppo(config) + run_ppo(config, FullyAsyncTaskRunner) if __name__ == "__main__": diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 192d33817a6..2487387b163 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -14,7 +14,6 @@ import logging from pprint import pprint -from typing import Optional import numpy as np import ray @@ -62,10 +61,10 @@ def __init__( processor=None, reward_fn=None, val_reward_fn=None, - train_dataset: Optional[Dataset] = None, - val_dataset: Optional[Dataset] = None, + train_dataset: Dataset | None = None, + val_dataset: Dataset | None = None, collate_fn=None, - train_sampler: Optional[Sampler] = None, + train_sampler: Sampler | None = None, device_name="cuda", ): self.config = config diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py index e28346a9ccd..dd9b5c5e8a9 100644 --- a/recipe/fully_async_policy/message_queue.py +++ b/recipe/fully_async_policy/message_queue.py @@ -12,12 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import threading -import time +import threadingimport time import uuid from collections import deque from dataclasses import dataclass -from typing import Any, Optional +from typing import Any import ray import zmq @@ -39,7 +38,7 @@ class BatchSample: rollout_metadata: dict[str, Any] -@ray.remote(num_cpus=1) +@ray.remote(num_cpus=24) class MessageQueue: """ 基于ZeroMQ的异步消息队列,用于Rollouter和Trainer之间的通信 @@ -84,7 +83,7 @@ def _setup_zmq(self): self.socket.bind(self.address) def put_batch( - self, epoch: int, batch: DataProto, param_version: int, rollout_metadata: dict[str, Any] = None + self, epoch: int, batch: DataProto, param_version: int, rollout_metadata: dict[str, Any] = None ) -> bool: """ 放入一个batch样本到队列 @@ -129,7 +128,7 @@ def put_batch( return True - def get_batch(self, min_batch_count: int = 1, timeout: float = 30.0) -> Optional[list[BatchSample]]: + def get_batch(self, min_batch_count: int = 1, timeout: float = 30.0) -> list[BatchSample] | None: """ 从队列获取batch样本 @@ -208,12 +207,12 @@ def __init__(self, queue_actor: ray.ActorHandle): self.queue_actor = queue_actor def put_batch( - self, epoch: int, batch: DataProto, param_version: int, rollout_metadata: dict[str, Any] = None + self, epoch: int, batch: DataProto, param_version: int, rollout_metadata: dict[str, Any] = None ) -> bool: """放入batch到队列""" return ray.get(self.queue_actor.put_batch.remote(epoch, batch, param_version, rollout_metadata)) - def get_batch(self, min_batch_count: int = 1, timeout: float = 30.0) -> Optional[list[BatchSample]]: + def get_batch(self, min_batch_count: int = 1, timeout: float = 30.0) -> list[BatchSample] | None: """从队列获取batch""" return ray.get(self.queue_actor.get_batch.remote(min_batch_count, timeout)) diff --git a/recipe/fully_async_policy/rollouter.py b/recipe/fully_async_policy/rollouter.py index d98f5e5fdf5..ac43b6e3dbf 100644 --- a/recipe/fully_async_policy/rollouter.py +++ b/recipe/fully_async_policy/rollouter.py @@ -16,7 +16,6 @@ import threading import time import uuid 
-from typing import Optional import numpy as np import ray @@ -85,9 +84,9 @@ def __init__( resource_pool_manager: ResourcePoolManager, ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, processor=None, - train_dataset: Optional[Dataset] = None, + train_dataset: Dataset | None = None, collate_fn=None, - train_sampler: Optional[Sampler] = None, + train_sampler: Sampler | None = None, device_name="cuda", ): self.config = config @@ -253,7 +252,7 @@ def _should_pause_generation(self) -> bool: return False - def _generate_batch(self, epoch: int, batch_dict: dict) -> Optional[DataProto]: + def _generate_batch(self, epoch: int, batch_dict: dict) -> DataProto | None: """生成单个batch的样本""" try: batch = DataProto.from_single_dict(batch_dict) diff --git a/recipe/fully_async_policy/test_mq.py b/recipe/fully_async_policy/test_mq.py new file mode 100644 index 00000000000..a8aaa8add5f --- /dev/null +++ b/recipe/fully_async_policy/test_mq.py @@ -0,0 +1,374 @@ +# Copyright 2025 Meituan Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +import time +import threading +from unittest.mock import Mock, patch, MagicMock +from omegaconf import DictConfig +import ray + +from message_queue import BatchSample, MessageQueue, MessageQueueClient + + +@pytest.fixture +def mock_data_proto(): + """Mock DataProto对象""" + return Mock() + + +@pytest.fixture +def basic_config(): + """基础配置""" + return DictConfig({ + 'async_training': { + 'freshness_threshold': 3 + } + }) + + +@pytest.fixture +def queue_config(): + """队列配置""" + return DictConfig({ + 'async_training': { + 'freshness_threshold': 2 + } + }) + + +class TestBatchSample: + """测试BatchSample数据类""" + + def test_batch_sample_creation(self, mock_data_proto): + """测试BatchSample创建""" + sample = BatchSample( + batch_id="test-123", + epoch=1, + data=mock_data_proto, + param_version=5, + timestamp=1234567890.0, + rollout_metadata={"key": "value"} + ) + + assert sample.batch_id == "test-123" + assert sample.epoch == 1 + assert sample.data == mock_data_proto + assert sample.param_version == 5 + assert sample.timestamp == 1234567890.0 + assert sample.rollout_metadata == {"key": "value"} + + +class TestMessageQueue: + """测试MessageQueue类(需要在非Ray环境下测试内部逻辑)""" + + @patch('message_queue.zmq.Context') + @patch('message_queue.FileLock') + @patch('socket.socket') + def test_message_queue_init(self, mock_socket, mock_filelock, mock_zmq_context, basic_config): + """测试MessageQueue初始化""" + # Mock socket + mock_sock_instance = Mock() + mock_sock_instance.getsockname.return_value = ('127.0.0.1', 12345) + mock_socket.return_value.__enter__.return_value = mock_sock_instance + + # Mock ZMQ + mock_context = Mock() + mock_zmq_context.return_value = mock_context + mock_zmq_socket = Mock() + mock_context.socket.return_value = mock_zmq_socket + + # Mock FileLock + mock_filelock.return_value.__enter__ = Mock(return_value=None) + mock_filelock.return_value.__exit__ = Mock(return_value=None) + + # 创建MessageQueue实例(不使用Ray装饰器) + queue = MessageQueue.__wrapped__(basic_config, 
max_queue_size=100) + + assert queue.max_queue_size == 100 + assert queue.current_param_version == 0 + assert queue.freshness_threshold == 3 + assert len(queue.queue) == 0 + assert queue.total_produced == 0 + assert queue.total_consumed == 0 + assert queue.dropped_samples == 0 + + +@pytest.fixture +def ray_setup(): + """设置Ray环境""" + if not ray.is_initialized(): + ray.init(local_mode=True, ignore_reinit_error=True) + yield + ray.shutdown() + + +@pytest.fixture +def message_queue_actor(ray_setup, basic_config): + """创建MessageQueue actor""" + with patch('message_queue.zmq.Context'), \ + patch('message_queue.FileLock'), \ + patch('socket.socket') as mock_socket: + # Mock socket setup + mock_sock_instance = Mock() + mock_sock_instance.getsockname.return_value = ('127.0.0.1', 12345) + mock_socket.return_value.__enter__.return_value = mock_sock_instance + + actor = MessageQueue.remote(basic_config, max_queue_size=10) + yield actor + ray.get(actor.shutdown.remote()) + + +class TestMessageQueueActor: + """测试MessageQueue Actor""" + + def test_put_batch_success(self, message_queue_actor, mock_data_proto): + """测试成功放入batch""" + result = ray.get(message_queue_actor.put_batch.remote( + epoch=1, + batch=mock_data_proto, + param_version=1, + rollout_metadata={"test": "data"} + )) + + assert result is True + + # 检查队列大小 + queue_size = ray.get(message_queue_actor.get_queue_size.remote()) + assert queue_size == 1 + + # 检查统计信息 + stats = ray.get(message_queue_actor.get_statistics.remote()) + assert stats["total_produced"] == 1 + assert stats["queue_size"] == 1 + + def test_put_batch_staleness_check(self, message_queue_actor, mock_data_proto): + """测试新鲜度检查""" + # 更新参数版本为5 + ray.get(message_queue_actor.update_param_version.remote(5)) + + # 尝试放入版本过旧的batch(版本差异>=3会被拒绝) + result = ray.get(message_queue_actor.put_batch.remote( + epoch=1, + batch=mock_data_proto, + param_version=2, # 5-2=3, 达到阈值 + rollout_metadata={} + )) + + assert result is False + + # 检查统计信息中的丢弃样本数 + stats = 
ray.get(message_queue_actor.get_statistics.remote()) + assert stats["dropped_samples"] == 1 + + def test_put_batch_queue_overflow(self, message_queue_actor, mock_data_proto): + """测试队列溢出处理""" + # 填满队列(最大容量10) + for i in range(12): # 超过最大容量 + ray.get(message_queue_actor.put_batch.remote( + epoch=1, + batch=mock_data_proto, + param_version=1, + rollout_metadata={} + )) + + # 队列大小应该保持在最大值 + queue_size = ray.get(message_queue_actor.get_queue_size.remote()) + assert queue_size == 10 + + # 检查统计信息 + stats = ray.get(message_queue_actor.get_statistics.remote()) + assert stats["dropped_samples"] == 2 # 超出的2个被丢弃 + + def test_get_batch_success(self, message_queue_actor, mock_data_proto): + """测试成功获取batch""" + # 先放入一些batch + for i in range(3): + ray.get(message_queue_actor.put_batch.remote( + epoch=i, + batch=mock_data_proto, + param_version=1, + rollout_metadata={"index": i} + )) + + # 获取2个batch + samples = ray.get(message_queue_actor.get_batch.remote(min_batch_count=2, timeout=5.0)) + + assert samples is not None + assert len(samples) == 2 + assert all(isinstance(sample, BatchSample) for sample in samples) + + # 检查队列大小减少 + queue_size = ray.get(message_queue_actor.get_queue_size.remote()) + assert queue_size == 1 + + # 检查统计信息 + stats = ray.get(message_queue_actor.get_statistics.remote()) + assert stats["total_consumed"] == 2 + + def test_get_batch_timeout(self, message_queue_actor): + """测试获取batch超时""" + # 空队列情况下获取batch应该超时 + samples = ray.get(message_queue_actor.get_batch.remote(min_batch_count=1, timeout=1.0)) + assert samples is None + + def test_update_param_version(self, message_queue_actor): + """测试更新参数版本""" + ray.get(message_queue_actor.update_param_version.remote(10)) + + stats = ray.get(message_queue_actor.get_statistics.remote()) + assert stats["current_param_version"] == 10 + + def test_clear_queue(self, message_queue_actor, mock_data_proto): + """测试清空队列""" + # 先添加一些样本 + for i in range(3): + ray.get(message_queue_actor.put_batch.remote( + epoch=i, 
batch=mock_data_proto, param_version=1 + )) + + # 清空队列 + ray.get(message_queue_actor.clear_queue.remote()) + + # 检查队列大小 + queue_size = ray.get(message_queue_actor.get_queue_size.remote()) + assert queue_size == 0 + + def test_get_statistics(self, message_queue_actor): + """测试获取统计信息""" + stats = ray.get(message_queue_actor.get_statistics.remote()) + + expected_keys = { + "queue_size", "total_produced", "total_consumed", + "dropped_samples", "current_param_version", "freshness_threshold" + } + assert set(stats.keys()) == expected_keys + assert isinstance(stats["queue_size"], int) + assert isinstance(stats["total_produced"], int) + assert isinstance(stats["total_consumed"], int) + + +class TestMessageQueueClient: + """测试MessageQueueClient""" + + def test_client_put_batch(self, message_queue_actor, mock_data_proto): + """测试客户端放入batch""" + client = MessageQueueClient(message_queue_actor) + + result = client.put_batch( + epoch=1, + batch=mock_data_proto, + param_version=1, + rollout_metadata={"test": "client"} + ) + + assert result is True + assert client.get_queue_size() == 1 + + def test_client_get_batch(self, message_queue_actor, mock_data_proto): + """测试客户端获取batch""" + client = MessageQueueClient(message_queue_actor) + + # 先放入一个batch + client.put_batch(epoch=1, batch=mock_data_proto, param_version=1) + + # 获取batch + samples = client.get_batch(min_batch_count=1, timeout=5.0) + + assert samples is not None + assert len(samples) == 1 + assert isinstance(samples[0], BatchSample) + + def test_client_update_param_version(self, message_queue_actor): + """测试客户端更新参数版本""" + client = MessageQueueClient(message_queue_actor) + + client.update_param_version(15) + + stats = client.get_statistics() + assert stats["current_param_version"] == 15 + + def test_client_get_queue_size(self, message_queue_actor, mock_data_proto): + """测试客户端获取队列大小""" + client = MessageQueueClient(message_queue_actor) + + assert client.get_queue_size() == 0 + + client.put_batch(epoch=1, batch=mock_data_proto, 
param_version=1) + assert client.get_queue_size() == 1 + + def test_client_clear_queue(self, message_queue_actor, mock_data_proto): + """测试客户端清空队列""" + client = MessageQueueClient(message_queue_actor) + + # 添加样本 + client.put_batch(epoch=1, batch=mock_data_proto, param_version=1) + assert client.get_queue_size() == 1 + + # 清空队列 + client.clear_queue() + assert client.get_queue_size() == 0 + + def test_client_shutdown(self, message_queue_actor): + """测试客户端关闭""" + client = MessageQueueClient(message_queue_actor) + + # 关闭不应该抛出异常 + client.shutdown() + + +class TestConcurrency: + """测试并发场景""" + + def test_concurrent_put_get(self, message_queue_actor, mock_data_proto): + """测试并发放入和获取""" + client = MessageQueueClient(message_queue_actor) + results = [] + + def producer(): + for i in range(5): + result = client.put_batch( + epoch=i, batch=mock_data_proto, param_version=1 + ) + results.append(("put", result)) + time.sleep(0.1) + + def consumer(): + for _ in range(3): + samples = client.get_batch(min_batch_count=1, timeout=2.0) + results.append(("get", samples is not None)) + time.sleep(0.1) + + # 启动生产者和消费者线程 + producer_thread = threading.Thread(target=producer) + consumer_thread = threading.Thread(target=consumer) + + producer_thread.start() + time.sleep(0.05) # 让生产者先开始 + consumer_thread.start() + + producer_thread.join() + consumer_thread.join() + + # 检查结果 + put_results = [r[1] for r in results if r[0] == "put"] + get_results = [r[1] for r in results if r[0] == "get"] + + assert all(put_results) # 所有放入操作都应该成功 + assert all(get_results) # 所有获取操作都应该成功 + + +# 运行测试的示例配置 +if __name__ == "__main__": + pytest.main([__file__, "-v", "--tb=short"]) diff --git a/verl/trainer/main_ppo.py b/verl/trainer/main_ppo.py index a9ea554687a..e81d0b32c1d 100644 --- a/verl/trainer/main_ppo.py +++ b/verl/trainer/main_ppo.py @@ -41,7 +41,7 @@ def main(config): # Define a function to run the PPO-like training process -def run_ppo(config) -> None: +def run_ppo(config, task_runner_class = None) -> None: 
"""Initialize Ray cluster and run distributed PPO training process. Args: @@ -59,6 +59,9 @@ def run_ppo(config) -> None: runtime_env=get_ppo_ray_runtime_env(), num_cpus=config.ray_init.num_cpus, ) + # for recipe to change TaskRunner + if task_runner_class is None: + task_runner_class = TaskRunner # Create a remote instance of the TaskRunner class, and # Execute the `run` method of the TaskRunner instance remotely and wait for it to complete @@ -68,9 +71,9 @@ def run_ppo(config) -> None: and len(config.trainer.get("profile_steps", [])) > 0 ): nsight_options = OmegaConf.to_container(config.trainer.controller_nsight_options) - runner = TaskRunner.options(runtime_env={"nsight": nsight_options}).remote() + runner = task_runner_class.options(runtime_env={"nsight": nsight_options}).remote() else: - runner = TaskRunner.remote() + runner = task_runner_class.remote() ray.get(runner.run.remote(config)) # [Optional] get the path of the timeline trace file from the configuration, default to None From 5c9dd6d7d824d803761451179218840b1dda3947 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Fri, 25 Jul 2025 22:16:35 +0800 Subject: [PATCH 004/182] test message queue --- recipe/fully_async_policy/message_queue.py | 45 +++--- recipe/fully_async_policy/test_fully_async.py | 3 +- recipe/fully_async_policy/test_mq.py | 138 +++++++----------- 3 files changed, 81 insertions(+), 105 deletions(-) diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py index dd9b5c5e8a9..f57d1e15325 100644 --- a/recipe/fully_async_policy/message_queue.py +++ b/recipe/fully_async_policy/message_queue.py @@ -12,19 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import threadingimport time +import threading +import time import uuid from collections import deque from dataclasses import dataclass -from typing import Any +from typing import Any, Optional import ray import zmq from filelock import FileLock from omegaconf import DictConfig -from verl import DataProto - @dataclass class BatchSample: @@ -32,13 +31,13 @@ class BatchSample: batch_id: str epoch: int - data: DataProto + data: Any param_version: int timestamp: float rollout_metadata: dict[str, Any] -@ray.remote(num_cpus=24) +@ray.remote(num_cpus=1) class MessageQueue: """ 基于ZeroMQ的异步消息队列,用于Rollouter和Trainer之间的通信 @@ -49,13 +48,24 @@ def __init__(self, config: DictConfig, max_queue_size: int = 1000): self.max_queue_size = max_queue_size self.queue = deque(maxlen=max_queue_size) self.current_param_version = 0 - self.freshness_threshold = config.async_training.get("freshness_threshold", 3) + + # 安全地获取配置值,避免递归问题 + try: + if hasattr(config, "async_training") and config.async_training is not None: + self.freshness_threshold = getattr(config.async_training, "freshness_threshold", 3) + else: + self.freshness_threshold = 3 + except (AttributeError, RecursionError): + self.freshness_threshold = 3 # ZeroMQ setup - self.context = zmq.Context() + self.context = None self.socket = None self.address = None - self._setup_zmq() + try: + self._setup_zmq() + except Exception as e: + print(f"Warning: ZeroMQ setup failed: {e}. 
Queue will work without ZeroMQ.") # Threading for message handling self.running = True @@ -71,6 +81,9 @@ def __init__(self, config: DictConfig, max_queue_size: int = 1000): def _setup_zmq(self): """设置ZeroMQ socket""" with FileLock("/tmp/verl_message_queue.lock"): + # 初始化 ZeroMQ context + self.context = zmq.Context() + # 使用TCP socket import socket as sock @@ -82,9 +95,7 @@ def _setup_zmq(self): self.socket = self.context.socket(zmq.PAIR) self.socket.bind(self.address) - def put_batch( - self, epoch: int, batch: DataProto, param_version: int, rollout_metadata: dict[str, Any] = None - ) -> bool: + def put_batch(self, epoch: int, batch: Any, param_version: int, rollout_metadata: dict[str, Any] = None) -> bool: """ 放入一个batch样本到队列 @@ -128,7 +139,7 @@ def put_batch( return True - def get_batch(self, min_batch_count: int = 1, timeout: float = 30.0) -> list[BatchSample] | None: + def get_batch(self, min_batch_count: int = 1, timeout: float = 30.0) -> Optional[list[BatchSample]]: """ 从队列获取batch样本 @@ -203,16 +214,14 @@ def get_address(self) -> str: class MessageQueueClient: """MessageQueue的客户端,用于与MessageQueue Actor通信""" - def __init__(self, queue_actor: ray.ActorHandle): + def __init__(self, queue_actor: Any): self.queue_actor = queue_actor - def put_batch( - self, epoch: int, batch: DataProto, param_version: int, rollout_metadata: dict[str, Any] = None - ) -> bool: + def put_batch(self, epoch: int, batch: Any, param_version: int, rollout_metadata: dict[str, Any] = None) -> bool: """放入batch到队列""" return ray.get(self.queue_actor.put_batch.remote(epoch, batch, param_version, rollout_metadata)) - def get_batch(self, min_batch_count: int = 1, timeout: float = 30.0) -> list[BatchSample] | None: + def get_batch(self, min_batch_count: int = 1, timeout: float = 30.0) -> Optional[list[BatchSample]]: """从队列获取batch""" return ray.get(self.queue_actor.get_batch.remote(min_batch_count, timeout)) diff --git a/recipe/fully_async_policy/test_fully_async.py 
b/recipe/fully_async_policy/test_fully_async.py index b2f7f866fd7..eaa9313254a 100644 --- a/recipe/fully_async_policy/test_fully_async.py +++ b/recipe/fully_async_policy/test_fully_async.py @@ -23,8 +23,7 @@ import ray from omegaconf import OmegaConf -from recipe.fully_async_policy.message_queue import MessageQueue, MessageQueueClient -from verl import DataProto +from recipe.fully_async_policy.message_queue import DataProto, MessageQueue, MessageQueueClient logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) diff --git a/recipe/fully_async_policy/test_mq.py b/recipe/fully_async_policy/test_mq.py index a8aaa8add5f..488b7d12614 100644 --- a/recipe/fully_async_policy/test_mq.py +++ b/recipe/fully_async_policy/test_mq.py @@ -12,14 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pytest -import time import threading -from unittest.mock import Mock, patch, MagicMock -from omegaconf import DictConfig -import ray +import time +from unittest.mock import Mock +import pytest +import ray from message_queue import BatchSample, MessageQueue, MessageQueueClient +from omegaconf import DictConfig @pytest.fixture @@ -31,21 +31,13 @@ def mock_data_proto(): @pytest.fixture def basic_config(): """基础配置""" - return DictConfig({ - 'async_training': { - 'freshness_threshold': 3 - } - }) + return DictConfig({"async_training": {"freshness_threshold": 3}}) @pytest.fixture def queue_config(): """队列配置""" - return DictConfig({ - 'async_training': { - 'freshness_threshold': 2 - } - }) + return DictConfig({"async_training": {"freshness_threshold": 2}}) class TestBatchSample: @@ -59,7 +51,7 @@ def test_batch_sample_creation(self, mock_data_proto): data=mock_data_proto, param_version=5, timestamp=1234567890.0, - rollout_metadata={"key": "value"} + rollout_metadata={"key": "value"}, ) assert sample.batch_id == "test-123" @@ -73,29 +65,16 @@ def test_batch_sample_creation(self, mock_data_proto): class 
TestMessageQueue: """测试MessageQueue类(需要在非Ray环境下测试内部逻辑)""" - @patch('message_queue.zmq.Context') - @patch('message_queue.FileLock') - @patch('socket.socket') - def test_message_queue_init(self, mock_socket, mock_filelock, mock_zmq_context, basic_config): + def test_message_queue_init(self, basic_config): """测试MessageQueue初始化""" - # Mock socket - mock_sock_instance = Mock() - mock_sock_instance.getsockname.return_value = ('127.0.0.1', 12345) - mock_socket.return_value.__enter__.return_value = mock_sock_instance - - # Mock ZMQ - mock_context = Mock() - mock_zmq_context.return_value = mock_context - mock_zmq_socket = Mock() - mock_context.socket.return_value = mock_zmq_socket + # 直接创建MessageQueue实例(不使用Ray装饰器) + queue = MessageQueue.__ray_actor_class__(basic_config, max_queue_size=100) - # Mock FileLock - mock_filelock.return_value.__enter__ = Mock(return_value=None) - mock_filelock.return_value.__exit__ = Mock(return_value=None) - - # 创建MessageQueue实例(不使用Ray装饰器) - queue = MessageQueue.__wrapped__(basic_config, max_queue_size=100) + # 确保ZeroMQ初始化成功 + assert queue.context is not None + assert queue.socket is not None + # 基本属性检查 assert queue.max_queue_size == 100 assert queue.current_param_version == 0 assert queue.freshness_threshold == 3 @@ -104,6 +83,9 @@ def test_message_queue_init(self, mock_socket, mock_filelock, mock_zmq_context, assert queue.total_consumed == 0 assert queue.dropped_samples == 0 + # 清理资源 + queue.shutdown() + @pytest.fixture def ray_setup(): @@ -117,17 +99,9 @@ def ray_setup(): @pytest.fixture def message_queue_actor(ray_setup, basic_config): """创建MessageQueue actor""" - with patch('message_queue.zmq.Context'), \ - patch('message_queue.FileLock'), \ - patch('socket.socket') as mock_socket: - # Mock socket setup - mock_sock_instance = Mock() - mock_sock_instance.getsockname.return_value = ('127.0.0.1', 12345) - mock_socket.return_value.__enter__.return_value = mock_sock_instance - - actor = MessageQueue.remote(basic_config, max_queue_size=10) - yield 
actor - ray.get(actor.shutdown.remote()) + actor = MessageQueue.remote(basic_config, max_queue_size=10) + yield actor + ray.get(actor.shutdown.remote()) class TestMessageQueueActor: @@ -135,12 +109,11 @@ class TestMessageQueueActor: def test_put_batch_success(self, message_queue_actor, mock_data_proto): """测试成功放入batch""" - result = ray.get(message_queue_actor.put_batch.remote( - epoch=1, - batch=mock_data_proto, - param_version=1, - rollout_metadata={"test": "data"} - )) + result = ray.get( + message_queue_actor.put_batch.remote( + epoch=1, batch=mock_data_proto, param_version=1, rollout_metadata={"test": "data"} + ) + ) assert result is True @@ -159,12 +132,14 @@ def test_put_batch_staleness_check(self, message_queue_actor, mock_data_proto): ray.get(message_queue_actor.update_param_version.remote(5)) # 尝试放入版本过旧的batch(版本差异>=3会被拒绝) - result = ray.get(message_queue_actor.put_batch.remote( - epoch=1, - batch=mock_data_proto, - param_version=2, # 5-2=3, 达到阈值 - rollout_metadata={} - )) + result = ray.get( + message_queue_actor.put_batch.remote( + epoch=1, + batch=mock_data_proto, + param_version=2, # 5-2=3, 达到阈值 + rollout_metadata={}, + ) + ) assert result is False @@ -176,12 +151,11 @@ def test_put_batch_queue_overflow(self, message_queue_actor, mock_data_proto): """测试队列溢出处理""" # 填满队列(最大容量10) for i in range(12): # 超过最大容量 - ray.get(message_queue_actor.put_batch.remote( - epoch=1, - batch=mock_data_proto, - param_version=1, - rollout_metadata={} - )) + ray.get( + message_queue_actor.put_batch.remote( + epoch=1, batch=mock_data_proto, param_version=1, rollout_metadata={} + ) + ) # 队列大小应该保持在最大值 queue_size = ray.get(message_queue_actor.get_queue_size.remote()) @@ -195,12 +169,11 @@ def test_get_batch_success(self, message_queue_actor, mock_data_proto): """测试成功获取batch""" # 先放入一些batch for i in range(3): - ray.get(message_queue_actor.put_batch.remote( - epoch=i, - batch=mock_data_proto, - param_version=1, - rollout_metadata={"index": i} - )) + ray.get( + 
message_queue_actor.put_batch.remote( + epoch=i, batch=mock_data_proto, param_version=1, rollout_metadata={"index": i} + ) + ) # 获取2个batch samples = ray.get(message_queue_actor.get_batch.remote(min_batch_count=2, timeout=5.0)) @@ -234,9 +207,7 @@ def test_clear_queue(self, message_queue_actor, mock_data_proto): """测试清空队列""" # 先添加一些样本 for i in range(3): - ray.get(message_queue_actor.put_batch.remote( - epoch=i, batch=mock_data_proto, param_version=1 - )) + ray.get(message_queue_actor.put_batch.remote(epoch=i, batch=mock_data_proto, param_version=1)) # 清空队列 ray.get(message_queue_actor.clear_queue.remote()) @@ -250,8 +221,12 @@ def test_get_statistics(self, message_queue_actor): stats = ray.get(message_queue_actor.get_statistics.remote()) expected_keys = { - "queue_size", "total_produced", "total_consumed", - "dropped_samples", "current_param_version", "freshness_threshold" + "queue_size", + "total_produced", + "total_consumed", + "dropped_samples", + "current_param_version", + "freshness_threshold", } assert set(stats.keys()) == expected_keys assert isinstance(stats["queue_size"], int) @@ -266,12 +241,7 @@ def test_client_put_batch(self, message_queue_actor, mock_data_proto): """测试客户端放入batch""" client = MessageQueueClient(message_queue_actor) - result = client.put_batch( - epoch=1, - batch=mock_data_proto, - param_version=1, - rollout_metadata={"test": "client"} - ) + result = client.put_batch(epoch=1, batch=mock_data_proto, param_version=1, rollout_metadata={"test": "client"}) assert result is True assert client.get_queue_size() == 1 @@ -338,9 +308,7 @@ def test_concurrent_put_get(self, message_queue_actor, mock_data_proto): def producer(): for i in range(5): - result = client.put_batch( - epoch=i, batch=mock_data_proto, param_version=1 - ) + result = client.put_batch(epoch=i, batch=mock_data_proto, param_version=1) results.append(("put", result)) time.sleep(0.1) From 3fd7020e0d34f2f03d3c0e0ffb5f9bfb84e873c2 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Wed, 30 Jul 
2025 14:03:27 +0800 Subject: [PATCH 005/182] main --- recipe/fully_async_policy/RollouterActor.py | 75 -- recipe/fully_async_policy/fully_async_main.py | 597 ++++++++++----- .../fully_async_rollouter.py | 681 ++++++++++++++++++ recipe/fully_async_policy/rollouter.py | 413 ----------- 4 files changed, 1103 insertions(+), 663 deletions(-) delete mode 100644 recipe/fully_async_policy/RollouterActor.py create mode 100644 recipe/fully_async_policy/fully_async_rollouter.py delete mode 100644 recipe/fully_async_policy/rollouter.py diff --git a/recipe/fully_async_policy/RollouterActor.py b/recipe/fully_async_policy/RollouterActor.py deleted file mode 100644 index fb5212b577a..00000000000 --- a/recipe/fully_async_policy/RollouterActor.py +++ /dev/null @@ -1,75 +0,0 @@ -# Copyright 2025 Meituan Ltd. and/or its affiliates -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import ray - -from recipe.fully_async_policy.rollouter import Rollouter - - -@ray.remote -class RollouterActor: - """Rollouter的Ray Actor包装器""" - - def __init__( - self, - config, - tokenizer, - role_worker_mapping, - resource_pool_manager, - ray_worker_group_cls, - processor=None, - train_dataset=None, - collate_fn=None, - train_sampler=None, - device_name="cuda", - ): - self.rollouter = Rollouter( - config=config, - tokenizer=tokenizer, - role_worker_mapping=role_worker_mapping, - resource_pool_manager=resource_pool_manager, - ray_worker_group_cls=ray_worker_group_cls, - processor=processor, - train_dataset=train_dataset, - collate_fn=collate_fn, - train_sampler=train_sampler, - device_name=device_name, - ) - - def init_workers(self): - """初始化worker""" - return self.rollouter.init_workers() - - def set_message_queue_client(self, message_queue_client): - """设置消息队列客户端""" - return self.rollouter.set_message_queue_client(message_queue_client) - - def set_parameter_synchronizer(self, param_synchronizer): - """设置参数同步器""" - return self.rollouter.set_parameter_synchronizer(param_synchronizer) - - def update_rollout_weights(self, param_version: int): - """更新rollout权重""" - return self.rollouter.update_rollout_weights(param_version) - - def fit(self): - """开始生成循环""" - return self.rollouter.fit() - - def shutdown(self): - """关闭rollouter""" - return self.rollouter.shutdown() - - def get_statistics(self): - """获取统计信息""" - return self.rollouter.get_statistics() diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py index f0689a5d28c..e57e3e119b7 100644 --- a/recipe/fully_async_policy/fully_async_main.py +++ b/recipe/fully_async_policy/fully_async_main.py @@ -13,75 +13,203 @@ # limitations under the License. 
import logging -import threading -import time import os +import signal import socket +import threading +import time +from pprint import pprint import hydra import ray from omegaconf import OmegaConf -from recipe.fully_async_policy.RollouterActor import RollouterActor -from verl.experimental.dataset.sampler import AbstractSampler -from verl.trainer.constants_ppo import get_ppo_ray_runtime_env -from verl.trainer.ppo.ray_trainer import RayPPOTrainer -from verl.trainer.ppo.reward import load_reward_manager -from verl.utils.device import is_cuda_available -from verl.utils.import_utils import load_extern_type - -import hydra -import ray - +from recipe.fully_async_policy.fully_async_rollouter import FullyAsyncRollouter +from recipe.fully_async_policy.fully_async_trainer import FullyAsyncTrainer from recipe.fully_async_policy.message_queue import MessageQueue, MessageQueueClient -from verl.trainer.main_ppo import create_rl_dataset, create_rl_sampler, run_ppo +from recipe.fully_async_policy.param_sync import AsyncParameterSynchronizer +from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role from verl.trainer.ppo.reward import load_reward_manager -from verl.utils.dataset.rl_dataset import collate_fn - -from fully_async_trainer import FullyAsyncTrainer +from verl.utils.fs import copy_to_local logger = logging.getLogger(__name__) def setup_logging(): """设置日志配置""" - logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + handlers=[logging.StreamHandler(), logging.FileHandler("fully_async_training.log")], + ) -@ray.remote(num_cpus=1) # please make sure main_task is not scheduled on head -class FullyAsyncTaskRunner: - """Ray remote class for executing distributed PPO training tasks. 
+def create_resource_pool_manager(config, roles: list) -> ResourcePoolManager: + """ + 创建资源池管理器 - This class encapsulates the main training logic and runs as a Ray remote actor - to enable distributed execution across multiple nodes and GPUs. + Args: + config: 配置对象 + roles: 需要创建资源池的角色列表 + + Returns: + ResourcePoolManager: 资源池管理器 """ + # 构建资源池规格 + resource_pool_spec = {} + mapping = {} + + # Actor/Critic资源池(训练相关) + if any(role in roles for role in [Role.Actor, Role.Critic, Role.RefPolicy, Role.RewardModel]): + assert config.trainer.n_gpus_per_node > 0, "config.trainer.n_gpus_per_node must be greater than 0" + assert config.trainer.nnodes > 0, "config.trainer.nnodes must be greater than 0" + + trainer_pool = [config.trainer.n_gpus_per_node] * config.trainer.nnodes + resource_pool_spec["trainer_pool"] = trainer_pool + + # 训练相关角色映射到同一个资源池 + for role in [Role.Actor, Role.Critic, Role.RefPolicy, Role.RewardModel]: + if role in roles: + mapping[role] = "trainer_pool" + + # Rollout资源池 + if Role.Rollout in roles: + assert config.rollout.n_gpus_per_node > 0, "config.rollout.n_gpus_per_node must be greater than 0" + assert config.rollout.nnodes > 0, "config.rollout.nnodes must be greater than 0" + + rollout_pool = [config.rollout.n_gpus_per_node] * config.rollout.nnodes + resource_pool_spec["rollout_pool"] = rollout_pool + mapping[Role.Rollout] = "rollout_pool" + + logger.info(f"Resource pool specification: {resource_pool_spec}") + logger.info(f"Role mapping: {mapping}") + + return ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=mapping) + + +def create_role_worker_mapping(config): + """ + 创建角色到worker类的映射 + + Args: + config: 配置对象 + + Returns: + dict: 角色到worker类的映射 + """ + # 根据策略选择worker类 + if config.actor_rollout_ref.actor.strategy == "fsdp2": + assert config.actor_rollout_ref.actor.strategy == config.critic.strategy + from recipe.one_step_off_policy.fsdp_workers import ( + ActorRolloutRefWorker, + AsyncActorRolloutRefWorker, + CriticWorker, + 
RolloutWorker, + ) + from verl.single_controller.ray import RayWorkerGroup + + actor_rollout_cls = ( + AsyncActorRolloutRefWorker if config.actor_rollout_ref.rollout.mode == "async" else ActorRolloutRefWorker + ) + ray_worker_group_cls = RayWorkerGroup + + elif config.actor_rollout_ref.actor.strategy == "megatron": + assert config.actor_rollout_ref.actor.strategy == config.critic.strategy + from recipe.one_step_off_policy.megatron_workers import ( + ActorRolloutRefWorker, + AsyncActorRolloutRefWorker, + CriticWorker, + RolloutWorker, + ) + from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup + + actor_rollout_cls = ( + AsyncActorRolloutRefWorker if config.actor_rollout_ref.rollout.mode == "async" else ActorRolloutRefWorker + ) + ray_worker_group_cls = NVMegatronRayWorkerGroup + + else: + raise NotImplementedError(f"Unsupported strategy: {config.actor_rollout_ref.actor.strategy}") + + role_worker_mapping = { + Role.Actor: ray.remote(actor_rollout_cls), + Role.Rollout: ray.remote(RolloutWorker), + Role.Critic: ray.remote(CriticWorker), + } + + # 添加reward model(如果启用) + if config.reward_model.enable: + if config.reward_model.strategy == "fsdp2": + from verl.workers.fsdp_workers import RewardModelWorker + elif config.reward_model.strategy == "megatron": + from verl.workers.megatron_workers import RewardModelWorker + else: + raise NotImplementedError(f"Unsupported reward model strategy: {config.reward_model.strategy}") + + role_worker_mapping[Role.RewardModel] = ray.remote(RewardModelWorker) + + # 添加reference policy(如果需要KL loss或reward) + if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss: + role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker) + + return role_worker_mapping, ray_worker_group_cls + + +@ray.remote(num_cpus=1) +class FullyAsyncTaskRunner: + """ + Ray remote class for executing distributed PPO training tasks. 
+ """ + + def __init__(self): + self.running = False + self.components = {} + self.shutdown_event = threading.Event() def run(self, config): """运行完全异步的PPO训练""" setup_logging() - logger.info("Starting fully async PPO training...") - # 创建数据集和采样器 - logger.info("Creating dataset and sampler...") - from verl.utils import hf_processor, hf_tokenizer + # 设置信号处理 + self._setup_signal_handlers() + # 初始化基础组件 + self._initialize_components(config) + # 启动训练流程 + self._run_training_loop() + + self._cleanup_resources() - # Print the initial configuration. `resolve=True` will evaluate symbolic values. - from pprint import pprint + def _setup_signal_handlers(self): + """设置信号处理器""" - from omegaconf import OmegaConf + def signal_handler(signum, frame): + logger.info(f"Received signal {signum}, initiating shutdown...") + self.running = False + self.shutdown_event.set() - from verl.utils.fs import copy_to_local + signal.signal(signal.SIGINT, signal_handler) + signal.signal(signal.SIGTERM, signal_handler) + def _initialize_components(self, config) -> None: + """ + 初始化所有组件 + + Args: + config: 配置对象 + + Returns: + bool: 是否初始化成功 + """ + # 打印配置信息 print(f"TaskRunner hostname: {socket.gethostname()}, PID: {os.getpid()}") pprint(OmegaConf.to_container(config, resolve=True)) OmegaConf.resolve(config) - # Download the checkpoint from HDFS to the local machine. - # `use_shm` determines whether to use shared memory, which could lead to faster model loading if turned on + # 初始化模型路径和tokenizer + logger.info("Initializing model and tokenizer...") local_path = copy_to_local( config.actor_rollout_ref.model.path, use_shm=config.actor_rollout_ref.model.get("use_shm", False) ) - # Instantiate the tokenizer and processor. from verl.utils import hf_processor, hf_tokenizer @@ -90,195 +218,314 @@ def run(self, config): # Used for multimodal LLM, could be None processor = hf_processor(local_path, trust_remote_code=trust_remote_code, use_fast=True) - # Define worker classes based on the actor strategy. 
- if config.actor_rollout_ref.actor.strategy == "fsdp2": - assert config.actor_rollout_ref.actor.strategy == config.critic.strategy - from verl.single_controller.ray import RayWorkerGroup - - from recipe.one_step_off_policy.fsdp_workers import ( - ActorRolloutRefWorker, - AsyncActorRolloutRefWorker, - CriticWorker, - RolloutWorker, - ) - - actor_rollout_cls = ( - AsyncActorRolloutRefWorker - if config.actor_rollout_ref.rollout.mode == "async" - else ActorRolloutRefWorker - ) - ray_worker_group_cls = RayWorkerGroup - - elif config.actor_rollout_ref.actor.strategy == "megatron": - assert config.actor_rollout_ref.actor.strategy == config.critic.strategy - from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup - - from recipe.one_step_off_policy.megatron_workers import ( - ActorRolloutRefWorker, - AsyncActorRolloutRefWorker, - CriticWorker, - RolloutWorker, - ) - - actor_rollout_cls = ( - AsyncActorRolloutRefWorker - if config.actor_rollout_ref.rollout.mode == "async" - else ActorRolloutRefWorker - ) - ray_worker_group_cls = NVMegatronRayWorkerGroup - - else: - raise NotImplementedError + self.components["tokenizer"] = tokenizer + self.components["processor"] = processor - from recipe.one_step_off_policy.ray_trainer import ResourcePoolManager, Role + # 创建worker映射和资源池 + logger.info("Creating worker mapping and resource pools...") + role_worker_mapping, ray_worker_group_cls = create_role_worker_mapping(config) + self.components["role_worker_mapping"] = role_worker_mapping + self.components["ray_worker_group_cls"] = ray_worker_group_cls - role_worker_mapping = { - Role.Actor: ray.remote(actor_rollout_cls), - Role.Rollout: ray.remote(RolloutWorker), - Role.Critic: ray.remote(CriticWorker), - } - - global_pool_id = "actor_pool" - rollout_pool_id = "rollout_pool" + # 创建数据集 + logger.info("Creating datasets...") + from verl.trainer.main_ppo import create_rl_dataset, create_rl_sampler + from verl.utils.dataset.rl_dataset import collate_fn - assert 
config.trainer.n_gpus_per_node > 0, "config.trainer.n_gpus_per_node must be greater than 0" - assert config.trainer.nnodes > 0, "config.trainer.nnodes must be greater than 0" - assert config.rollout.n_gpus_per_node > 0, "config.rollout.n_gpus_per_node must be greater than 0" - assert config.rollout.nnodes > 0, "config.rollout.nnodes must be greater than 0" + train_dataset = create_rl_dataset(config.data.train_files, config.data, tokenizer, processor) + val_dataset = create_rl_dataset(config.data.val_files, config.data, tokenizer, processor) + train_sampler = create_rl_sampler(config.data, train_dataset) - actor_pool = [config.trainer.n_gpus_per_node] * config.trainer.nnodes - rollout_pool = [config.rollout.n_gpus_per_node] * config.rollout.nnodes + self.components["train_dataset"] = train_dataset + self.components["val_dataset"] = val_dataset + self.components["train_sampler"] = train_sampler + self.components["collate_fn"] = collate_fn - resource_pool_spec = { - "actor_pool": actor_pool, - "rollout_pool": rollout_pool, - } - mapping = { - Role.Actor: global_pool_id, - Role.Rollout: rollout_pool_id, - Role.Critic: global_pool_id, - } - print(f"resource_pool_spec: {resource_pool_spec}") - # We should adopt a multi-source reward function here: - # - for rule-based rm, we directly call a reward score - # - for model-based rm, we call a model - # - for code related prompt, we send to a sandbox if there are test cases - # finally, we combine all the rewards together - # The reward type depends on the tag of the data - if config.reward_model.enable: - if config.reward_model.strategy == "fsdp2": - from verl.workers.fsdp_workers import RewardModelWorker - elif config.reward_model.strategy == "megatron": - from verl.workers.megatron_workers import RewardModelWorker - else: - raise NotImplementedError - role_worker_mapping[Role.RewardModel] = ray.remote(RewardModelWorker) - mapping[Role.RewardModel] = global_pool_id - - # Add a reference policy worker if KL loss or KL reward 
is used. - if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss: - role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker) - mapping[Role.RefPolicy] = global_pool_id - - # Load the reward manager for training and validation. + # 创建奖励函数 + logger.info("Loading reward functions...") reward_fn = load_reward_manager( config, tokenizer, num_examine=0, **config.reward_model.get("reward_kwargs", {}) ) val_reward_fn = load_reward_manager( config, tokenizer, num_examine=1, **config.reward_model.get("reward_kwargs", {}) ) - resource_pool_manager = ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=mapping) - - from verl.utils.dataset.rl_dataset import collate_fn + self.components["reward_fn"] = reward_fn + self.components["val_reward_fn"] = val_reward_fn - # Create training and validation datasets. - train_dataset = create_rl_dataset(config.data.train_files, config.data, tokenizer, processor) - val_dataset = create_rl_dataset(config.data.val_files, config.data, tokenizer, processor) - train_sampler = create_rl_sampler(config.data, train_dataset) - - # 1. 创建MessageQueue + # 创建MessageQueue logger.info("Creating MessageQueue...") - # todo max_queue_size auto compute max_queue_size = config.async_training.get("max_queue_size", 1000) message_queue = MessageQueue.remote(config, max_queue_size) message_queue_client = MessageQueueClient(message_queue) - # 2. 
创建Rollouter Actor + self.components["message_queue"] = message_queue + self.components["message_queue_client"] = message_queue_client + + # 创建Rollouter logger.info("Creating Rollouter...") - rollouter_actor = RollouterActor.remote( + self._create_rollouter(config) + + # 创建Trainer + logger.info("Creating FullyAsyncTrainer...") + self._create_trainer(config) + + # 设置参数同步 + logger.info("Setting up parameter synchronization...") + param_synchronizer = AsyncParameterSynchronizer( config=config, - tokenizer=tokenizer, - role_worker_mapping={Role.Rollout: role_worker_mapping[Role.Rollout]}, + actor_wg=self.components["trainer"].actor_wg, + rollouter=self.components["rollouter"], + ) + self.components["param_synchronizer"] = param_synchronizer + logger.info("All components initialized successfully") + + def _create_rollouter(self, config) -> None: + """创建Rollouter""" + rollouter = FullyAsyncRollouter.remote( + config=config, + tokenizer=self.components["tokenizer"], + role_worker_mapping={Role.Rollout: self.components["role_worker_mapping"][Role.Rollout]}, resource_pool_manager=create_resource_pool_manager(config, roles=[Role.Rollout]), - ray_worker_group_cls=RayWorkerGroup, - processor=processor, - train_dataset=train_dataset, - collate_fn=collate_fn, - train_sampler=train_sampler, + ray_worker_group_cls=self.components["ray_worker_group_cls"], + processor=self.components["processor"], + train_dataset=self.components["train_dataset"], + collate_fn=self.components["collate_fn"], + train_sampler=self.components["train_sampler"], device_name=config.trainer.device, ) # 初始化Rollouter - ray.get(rollouter_actor.init_workers.remote()) - ray.get(rollouter_actor.set_message_queue_client.remote(message_queue_client)) + init_future = rollouter.init_workers.remote() + ray.get(init_future, timeout=60.0) - # 3. 
创建Trainer - logger.info("Creating FullyAsyncTrainer...") + set_queue_future = rollouter.set_message_queue_client.remote(self.components["message_queue_client"]) + ray.get(set_queue_future, timeout=10.0) + + self.components["rollouter"] = rollouter + logger.info("Rollouter created and initialized successfully") + + def _create_trainer(self, config) -> None: + """创建Trainer""" + # 创建trainer角色映射(排除Rollout) trainer_role_mapping = { - role: worker_cls for role, worker_cls in role_worker_mapping.items() if role != Role.Rollout + role: worker_cls + for role, worker_cls in self.components["role_worker_mapping"].items() + if role != Role.Rollout } - # 创建奖励函数 - reward_fn, val_reward_fn = load_reward_manager(config, tokenizer) - - trainer = FullyAsyncTrainer( + trainer = FullyAsyncTrainer.remote( config=config, - tokenizer=tokenizer, + tokenizer=self.components["tokenizer"], role_worker_mapping=trainer_role_mapping, resource_pool_manager=create_resource_pool_manager(config, roles=list(trainer_role_mapping.keys())), - ray_worker_group_cls=RayWorkerGroup, - processor=processor, - reward_fn=reward_fn, - val_reward_fn=val_reward_fn, - train_dataset=train_dataset, - val_dataset=val_dataset, - collate_fn=collate_fn, - train_sampler=train_sampler, + ray_worker_group_cls=self.components["ray_worker_group_cls"], + processor=self.components["processor"], + reward_fn=self.components["reward_fn"], + val_reward_fn=self.components["val_reward_fn"], + train_dataset=self.components["train_dataset"], + val_dataset=self.components["val_dataset"], + collate_fn=self.components["collate_fn"], + train_sampler=self.components["train_sampler"], device_name=config.trainer.device, ) # 初始化Trainer trainer.init_workers() - trainer.set_message_queue_client(message_queue_client) - trainer.set_rollouter_actor(rollouter_actor) + trainer.set_message_queue_client(self.components["message_queue_client"]) + trainer.set_rollouter(self.components["rollouter"]) - # 4. 
设置参数同步 - logger.info("Setting up parameter synchronization...") - # param_synchronizer = AsyncParameterSynchronizer( - # config=config, actor_wg=trainer.actor_wg, rollouter_actor=rollouter_actor - # ) + self.components["trainer"] = trainer + logger.info("FullyAsyncTrainer created and initialized successfully") - # 5. 启动Rollouter(在后台线程中) - logger.info("Starting Rollouter in background...") + def _run_training_loop(self): + """运行训练循环""" + self.running = True - def run_rollouter(): + logger.info("Starting Rollouter in background...") + rollouter_future = self.components["rollouter"].fit.remote() + time.sleep(2.0) + trainer_future = self.components["trainer"].fit.remote() + self._monitor_components() + ray.get(rollouter_future) + ray.get(trainer_future) + + logger.info("Training completed or interrupted") + + def _run_rollouter(self): + try: + ray.get(self.components["rollouter"].fit.remote()) + except Exception as e: + logger.error(f"Rollouter error: {e}") + self.running = False + self.shutdown_event.set() + + def _run_trainer(self): + """运行trainer""" + try: + self.components["trainer"].fit() + except Exception as e: + logger.error(f"Trainer error: {e}") + finally: + self.running = False + self.shutdown_event.set() + + def _monitor_components(self): + """监控组件状态""" + logger.info("Starting component monitoring...") + + last_stats_time = time.time() + stats_interval = 60.0 # 60秒报告一次统计 + + while self.running and not self.shutdown_event.is_set(): try: - ray.get(rollouter_actor.fit.remote()) + # 等待一段时间或直到收到停止信号 + if self.shutdown_event.wait(timeout=10.0): + break + + # 定期报告统计信息 + current_time = time.time() + if current_time - last_stats_time >= stats_interval: + self._log_component_statistics() + last_stats_time = current_time + + # 检查组件健康状态 + self._check_component_health() + except Exception as e: - logger.error(f"Rollouter error: {e}") + logger.error(f"Error in component monitoring: {e}") + + logger.info("Component monitoring stopped") + + def 
_log_component_statistics(self): + """记录组件统计信息""" + try: + # 获取Trainer统计 + trainer_stats = self.components["trainer"].get_statistics() + + # 获取Rollouter统计 + rollouter_stats = ray.get(self.components["rollouter"].get_statistics.remote(), timeout=5.0) + + # 获取队列统计 + queue_stats = self.components["message_queue_client"].get_statistics() - rollouter_thread = threading.Thread(target=run_rollouter, daemon=True) - rollouter_thread.start() + logger.info("=== Component Statistics ===") + logger.info( + f"Trainer - Steps: {trainer_stats['global_steps']}, " + f"Samples: {trainer_stats['processed_samples']}, " + f"Param version: {trainer_stats['current_param_version']}" + ) + + logger.info( + f"Rollouter - Generated: {rollouter_stats['total_generated_samples']}, " + f"Dropped: {rollouter_stats['dropped_stale_samples']}, " + f"Errors: {rollouter_stats['generation_errors']}" + ) - # 6. 启动Trainer(主线程) - logger.info("Starting FullyAsyncTrainer...") - trainer.fit() + logger.info( + f"Queue - Size: {queue_stats['queue_size']}, " + f"Produced: {queue_stats['total_produced']}, " + f"Consumed: {queue_stats['total_consumed']}" + ) + + except Exception as e: + logger.error(f"Error getting component statistics: {e}") + + def _check_component_health(self): + """检查组件健康状态""" + try: + # 检查trainer是否仍在运行 + if hasattr(self.components["trainer"], "global_steps"): + current_steps = self.components["trainer"].global_steps + # 可以添加更多健康检查逻辑 + print(current_steps) + + # 检查rollouter是否仍在运行 + rollouter_stats = ray.get(self.components["rollouter"].get_statistics.remote(), timeout=5.0) + + if not rollouter_stats["is_running"]: + logger.warning("Rollouter is not running!") + # 可以尝试重启或报告错误 + + except Exception as e: + logger.warning(f"Health check failed: {e}") + + def _cleanup_resources(self): + """清理资源""" + logger.info("Cleaning up resources...") + + try: + # 停止Rollouter + if "rollouter" in self.components: + logger.info("Shutting down Rollouter...") + try: + shutdown_future = 
self.components["rollouter"].shutdown.remote() + ray.get(shutdown_future, timeout=10.0) + except Exception as e: + logger.warning(f"Error shutting down Rollouter: {e}") + + # 清理MessageQueue + if "message_queue_client" in self.components: + logger.info("Cleaning up MessageQueue...") + try: + self.components["message_queue_client"].shutdown() + except Exception as e: + logger.warning(f"Error cleaning up MessageQueue: {e}") + + # 清理参数同步器 + if "param_synchronizer" in self.components: + logger.info("Cleaning up parameter synchronizer...") + # TODO: 添加参数同步器的清理逻辑 + + logger.info("Resource cleanup completed") + + except Exception as e: + logger.error(f"Error during cleanup: {e}") + + def get_training_status(self) -> dict: + """获取训练状态""" + if not self.running or "trainer" not in self.components: + return {"status": "not_running"} + + try: + trainer_stats = self.components["trainer"].get_statistics() + rollouter_stats = ray.get(self.components["rollouter"].get_statistics.remote(), timeout=5.0) + + return { + "status": "running", + "trainer_stats": trainer_stats, + "rollouter_stats": rollouter_stats, + } + except Exception as e: + logger.error(f"Error getting training status: {e}") + return {"status": "error", "error": str(e)} @hydra.main(config_path="config", config_name="fully_async_ppo_trainer", version_base=None) def main(config): """主入口函数""" - run_ppo(config, FullyAsyncTaskRunner) + from verl.trainer.main_ppo import run_ppo + + # 确保异步训练配置存在 + if not hasattr(config, "async_training"): + # 设置默认异步训练配置 + config.async_training = OmegaConf.create( + { + "freshness_threshold": 3, + "max_staleness_allowed": 5, + "max_queue_size": 1000, + "min_batch_count": 1, + "batch_timeout": 30.0, + "generation_timeout": 30.0, + "batch_generation_interval": 0.1, + "max_sync_retries": 3, + "sync_timeout": 30.0, + "sync_retry_delay": 1.0, + } + ) + logger.info("Using default async training configuration") + + logger.info("Starting fully async PPO training with improved architecture") + 
run_ppo(config, task_runner_class=FullyAsyncTaskRunner) if __name__ == "__main__": diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py new file mode 100644 index 00000000000..c127b242704 --- /dev/null +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -0,0 +1,681 @@ +# Copyright 2025 Meituan Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import threading +import time +import uuid +from concurrent.futures import ThreadPoolExecutor +from typing import Optional + +import numpy as np +import ray +from omegaconf import OmegaConf +from torch.utils.data import Dataset, Sampler + +from recipe.fully_async_policy.message_queue import MessageQueueClient +from verl import DataProto +from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup +from verl.single_controller.ray.base import create_colocated_worker_cls +from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role, WorkerType +from verl.utils.debug import marked_timer + +logger = logging.getLogger(__name__) + + +class RolloutController: + """控制rollout的暂停和恢复 - 改进的控制机制""" + + def __init__(self): + self.is_paused = False + self.pause_event = threading.Event() + self.resume_event = threading.Event() + self.resume_event.set() # 初始状态为可运行 + self.pending_requests = [] + self.lock = threading.RLock() + self.pause_count = 0 + + def pause(self, timeout: Optional[float] = None) -> bool: + """ 
+ 暂停rollout + + Args: + timeout: 暂停超时时间,如果为None则无限等待 + + Returns: + bool: 是否成功暂停 + """ + with self.lock: + if not self.is_paused: + self.is_paused = True + self.resume_event.clear() + self.pause_event.set() + self.pause_count += 1 + logger.info(f"Rollout paused (count: {self.pause_count})") + return True + else: + logger.debug("Rollout already paused") + return True + + def resume(self) -> bool: + """ + 恢复rollout + + Returns: + bool: 是否成功恢复 + """ + with self.lock: + if self.is_paused: + self.is_paused = False + self.pause_event.clear() + self.resume_event.set() + logger.info("Rollout resumed") + return True + else: + logger.debug("Rollout already running") + return True + + def wait_if_paused(self, timeout: float = None) -> bool: + """ + 如果被暂停则等待恢复 + + Args: + timeout: 等待超时时间 + + Returns: + bool: 是否成功等待(未超时) + """ + if self.is_paused: + logger.debug(f"Waiting for resume (timeout: {timeout})") + return self.resume_event.wait(timeout) + return True + + def is_pause_requested(self) -> bool: + """检查是否有暂停请求""" + return self.pause_event.is_set() + + def get_status(self) -> dict: + """获取控制器状态""" + with self.lock: + return { + "is_paused": self.is_paused, + "pause_count": self.pause_count, + "has_pending_requests": len(self.pending_requests) > 0, + } + + +class Rollouter: + """ + 异步样本生成器,负责持续生成训练样本并放入MessageQueue + 基于OneStepOffRayTrainer的成熟实现改进 + """ + + def __init__( + self, + config, + tokenizer, + role_worker_mapping: dict[Role, WorkerType], + resource_pool_manager: ResourcePoolManager, + ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, + processor=None, + train_dataset: Dataset | None = None, + collate_fn=None, + train_sampler: Sampler | None = None, + device_name="cuda", + ): + self.config = config + self.tokenizer = tokenizer + self.processor = processor + self.role_worker_mapping = role_worker_mapping + self.resource_pool_manager = resource_pool_manager + self.ray_worker_group_cls = ray_worker_group_cls + self.device_name = device_name + + # 数据相关 + 
self.train_dataset = train_dataset + self.collate_fn = collate_fn + self.train_sampler = train_sampler + + # Rollout控制 + self.rollout_controller = RolloutController() + self.current_param_version = 0 + + # 新鲜度控制 - 改进的配置管理 + async_config = config.async_training + self.freshness_threshold = async_config.get("freshness_threshold", 3) + self.max_staleness_allowed = async_config.get("max_staleness_allowed", 5) + self.generation_timeout = async_config.get("generation_timeout", 30.0) + self.batch_generation_interval = async_config.get("batch_generation_interval", 0.1) + + # 统计信息 + self.total_generated_samples = 0 + self.dropped_stale_samples = 0 + self.generation_errors = 0 + self.param_sync_requests = 0 + + # Worker groups + self.rollout_wg = None + self.message_queue_client = None + + # 运行状态 + self.running = False + self.generation_thread = None + self.thread_executor = ThreadPoolExecutor(max_workers=2) + + # 参数同步相关 + self.param_synchronizer = None + self.last_sync_time = 0 + self.sync_in_progress = False + self.sync_lock = threading.Lock() + + # 异步rollout模式 + self.async_rollout_mode = config.actor_rollout_ref.rollout.mode == "async" + + self._validate_config() + + def _validate_config(self): + """验证配置""" + required_configs = [ + "data.train_batch_size", + "actor_rollout_ref.rollout.n", + "async_training.freshness_threshold", + ] + + for config_path in required_configs: + if not OmegaConf.select(self.config, config_path): + logger.warning(f"Missing recommended config: {config_path}") + + # 验证异步训练配置 + if not hasattr(self.config, "async_training"): + raise ValueError("Missing async_training configuration") + + def init_workers(self): + """初始化rollout workers - 参考OneStepOffRayTrainer的实现""" + logger.info("Initializing Rollouter workers...") + + self.resource_pool_manager.create_resource_pool() + self.resource_pool_to_cls = {pool: {} for pool in self.resource_pool_manager.resource_pool_dict.values()} + + # 只创建rollout worker + resource_pool = 
self.resource_pool_manager.get_resource_pool(Role.Rollout) + role_cls = RayClassWithInitArgs( + cls=self.role_worker_mapping[Role.Rollout], + config=self.config.actor_rollout_ref, + role="rollout", + ) + self.resource_pool_to_cls[resource_pool]["rollout"] = role_cls + + # 初始化WorkerGroup + all_wg = {} + wg_kwargs = {} + if OmegaConf.select(self.config.trainer, "ray_wait_register_center_timeout") is not None: + wg_kwargs["ray_wait_register_center_timeout"] = self.config.trainer.ray_wait_register_center_timeout + if OmegaConf.select(self.config.trainer, "profile_steps") is not None: + wg_kwargs["profile_steps"] = OmegaConf.select(self.config.trainer, "profile_steps") + if OmegaConf.select(self.config.trainer, "worker_nsight_options") is not None: + wg_kwargs["worker_nsight_options"] = OmegaConf.to_container( + OmegaConf.select(self.config.trainer, "worker_nsight_options") + ) + + for resource_pool, class_dict in self.resource_pool_to_cls.items(): + worker_dict_cls = create_colocated_worker_cls(class_dict=class_dict) + wg_dict = self.ray_worker_group_cls( + resource_pool=resource_pool, + ray_cls_with_init=worker_dict_cls, + device_name=self.device_name, + **wg_kwargs, + ) + spawn_wg = wg_dict.spawn(prefix_set=class_dict.keys()) + all_wg.update(spawn_wg) + + self.rollout_wg = all_wg["rollout"] + self.rollout_wg.init_model() + + # 初始化异步rollout管理器(如果需要) + if self.async_rollout_mode: + self._init_async_rollout_manager() + + logger.info("Rollouter workers initialized successfully") + + def _init_async_rollout_manager(self): + """初始化异步rollout管理器""" + try: + from verl.workers.rollout.async_server import AsyncLLMServerManager + + self.async_rollout_manager = AsyncLLMServerManager( + config=self.config, + worker_group=self.rollout_wg, + ) + logger.info("Async rollout manager initialized") + except Exception as e: + logger.warning(f"Failed to initialize async rollout manager: {e}") + self.async_rollout_mode = False + + def set_message_queue_client(self, message_queue_client: 
MessageQueueClient): + """设置消息队列客户端""" + self.message_queue_client = message_queue_client + + def set_parameter_synchronizer(self, param_synchronizer): + """设置参数同步器""" + self.param_synchronizer = param_synchronizer + + def update_rollout_weights(self, param_version: int) -> bool: + """ + 更新rollout模型参数 - 改进的参数同步实现 + 这个方法由外部Trainer调用 + + Args: + param_version: 新的参数版本号 + + Returns: + bool: 是否成功更新参数 + """ + logger.info(f"Updating rollout weights to version {param_version}") + + with self.sync_lock: + if self.sync_in_progress: + logger.warning(f"Sync already in progress, skipping version {param_version}") + return False + + self.sync_in_progress = True + + try: + # 暂停rollout - 带超时机制 + if not self.rollout_controller.pause(timeout=10.0): + logger.error("Failed to pause rollout within timeout") + return False + + # 等待当前generation完成(如果有的话) + time.sleep(0.1) + + # 执行参数同步 + sync_success = self._execute_parameter_sync(param_version) + + if sync_success: + self.current_param_version = param_version + self.param_sync_requests += 1 + self.last_sync_time = time.time() + logger.info(f"Successfully updated rollout weights to version {param_version}") + else: + logger.error(f"Failed to sync parameters to version {param_version}") + + except Exception as e: + logger.error(f"Error during parameter sync: {e}") + sync_success = False + finally: + # 恢复rollout + self.rollout_controller.resume() + self.sync_in_progress = False + + return sync_success + + def _execute_parameter_sync(self, param_version: int) -> bool: + """ + 执行实际的参数同步 - 改进的同步逻辑 + + Args: + param_version: 目标参数版本 + + Returns: + bool: 是否同步成功 + """ + try: + # 暂停推理引擎 + if self.async_rollout_mode and hasattr(self, "async_rollout_manager"): + # 对于异步模式,暂停服务器 + pass # 异步服务器的暂停在 pause() 中已经处理 + else: + # 对于同步模式,使用sleep/wake_up机制 + sleep_futures = self.rollout_wg.sleep() + ray.get(sleep_futures) + + # 执行参数同步 + if self.param_synchronizer: + self.param_synchronizer.sync_weights() + logger.debug("Parameter synchronization completed via 
synchronizer") + else: + # 直接使用rollout worker group的同步机制 + if hasattr(self.rollout_wg, "sync_rollout_weights"): + sync_futures = self.rollout_wg.sync_rollout_weights() + ray.get(sync_futures) + logger.debug("Parameter synchronization completed via rollout worker group") + else: + logger.warning("No parameter synchronization mechanism available") + return False + + # 恢复推理引擎 + if self.async_rollout_mode and hasattr(self, "async_rollout_manager"): + # 对于异步模式,恢复服务器 + pass # 异步服务器的恢复在 resume() 中已经处理 + else: + # 对于同步模式,唤醒workers + wake_futures = self.rollout_wg.wake_up() + ray.get(wake_futures) + + return True + + except Exception as e: + logger.error(f"Parameter sync execution failed: {e}") + return False + + def _create_dataloader(self): + """创建数据加载器""" + from torch.utils.data import DataLoader + + if self.train_dataset is None: + raise ValueError("Training dataset not provided") + + return DataLoader( + self.train_dataset, + batch_size=self.config.data.train_batch_size, + sampler=self.train_sampler, + collate_fn=self.collate_fn, + num_workers=self.config.data.get("dataloader_num_workers", 0), + drop_last=True, + pin_memory=True, # 改进内存管理 + ) + + def _create_continuous_iterator(self): + """创建连续的数据迭代器""" + dataloader = self._create_dataloader() + + epoch = 0 + while self.running: + try: + for batch_dict in dataloader: + if not self.running: + return + yield epoch, batch_dict + epoch += 1 + except Exception as e: + logger.error(f"Error in data iterator: {e}") + time.sleep(1.0) # 避免快速重试 + continue + + def _should_pause_generation(self) -> bool: + """ + 判断是否应该暂停生成,基于新鲜度控制 - 改进的判断逻辑 + """ + if self.message_queue_client is None: + return False + + try: + queue_stats = self.message_queue_client.get_statistics() + queue_size = queue_stats["queue_size"] + current_trainer_version = queue_stats["current_param_version"] + + # 计算参数版本差异 + version_diff = self.current_param_version - current_trainer_version + + # 如果版本差异过大,暂停生成 + if version_diff >= self.max_staleness_allowed: + 
logger.debug( + f"Should pause due to staleness: rollout_version={self.current_param_version}, " + f"trainer_version={current_trainer_version}, diff={version_diff}" + ) + return True + + # 如果队列太满,也暂停生成 + max_queue_size = self.freshness_threshold * self.config.data.train_batch_size + if queue_size >= max_queue_size: + logger.debug(f"Should pause due to full queue: size={queue_size}, max={max_queue_size}") + return True + + return False + + except Exception as e: + logger.error(f"Error checking pause conditions: {e}") + return True # 出错时暂停生成 + + def _generate_batch(self, epoch: int, batch_dict: dict) -> Optional[DataProto]: + """生成单个batch的样本 - 改进的生成逻辑""" + try: + batch = DataProto.from_single_dict(batch_dict) + + # 处理batch用于生成 - 参考OneStepOffRayTrainer的处理逻辑 + batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"] + non_tensor_batch_keys_to_pop = ["raw_prompt_ids"] + + # 处理多模态数据和其他可选字段 + optional_keys = ["multi_modal_data", "raw_prompt", "tools_kwargs", "interaction_kwargs"] + for key in optional_keys: + if key in batch.non_tensor_batch: + non_tensor_batch_keys_to_pop.append(key) + + gen_batch = batch.pop( + batch_keys=batch_keys_to_pop, + non_tensor_batch_keys=non_tensor_batch_keys_to_pop, + ) + + # 重复生成多个响应 - 参考OneStepOffRayTrainer + n_repeats = self.config.actor_rollout_ref.rollout.n + gen_batch = gen_batch.repeat(repeat_times=n_repeats, interleave=True) + + # 执行生成 + if self.async_rollout_mode: + # 异步生成 + gen_batch_output = ray.get( + self.rollout_wg.async_generate_sequences.remote(gen_batch), timeout=self.generation_timeout + ) + else: + # 同步生成 + gen_batch_output = ray.get( + self.rollout_wg.generate_sequences.remote(gen_batch), timeout=self.generation_timeout + ) + + # 添加UID - 确保每个样本有唯一标识 + batch.non_tensor_batch["uid"] = np.array([str(uuid.uuid4()) for _ in range(len(batch.batch))], dtype=object) + + # 重复原始batch以对齐生成的响应 + batch = batch.repeat(repeat_times=n_repeats, interleave=True) + + # 合并数据 + final_batch = batch.union(gen_batch_output) + + # 
添加rollout metadata + final_batch.meta_info["rollout_param_version"] = self.current_param_version + final_batch.meta_info["generation_timestamp"] = time.time() + + return final_batch + + except Exception as e: + logger.error(f"Error generating batch: {e}") + self.generation_errors += 1 + return None + + def _generation_loop(self): + """主要的生成循环 - 改进的循环逻辑""" + logger.info("Starting generation loop...") + + try: + continuous_iterator = self._create_continuous_iterator() + + for epoch, batch_dict in continuous_iterator: + if not self.running: + break + + # 等待如果被暂停 + if not self.rollout_controller.wait_if_paused(timeout=1.0): + if not self.running: + break + continue + + # 检查是否应该暂停生成 + if self._should_pause_generation(): + time.sleep(self.batch_generation_interval) + continue + + # 生成样本 + timing_raw = {} + with marked_timer("generate_batch", timing_raw): + generated_batch = self._generate_batch(epoch, batch_dict) + + if generated_batch is not None: + # 准备rollout metadata + rollout_metadata = { + "timing": timing_raw, + "generation_timestamp": time.time(), + "rollout_param_version": self.current_param_version, + "epoch": epoch, + } + + # 放入队列 + success = self.message_queue_client.put_batch( + epoch=epoch, + batch=generated_batch, + param_version=self.current_param_version, + rollout_metadata=rollout_metadata, + ) + + if success: + self.total_generated_samples += 1 + if self.total_generated_samples % 10 == 0: + logger.info( + f"Generated {self.total_generated_samples} batches, " + f"param_version={self.current_param_version}, " + f"errors={self.generation_errors}" + ) + else: + self.dropped_stale_samples += 1 + if self.dropped_stale_samples % 5 == 0: + logger.warning(f"Dropped stale samples: {self.dropped_stale_samples}") + + # 控制生成频率 + if self.batch_generation_interval > 0: + time.sleep(self.batch_generation_interval) + + except Exception as e: + logger.error(f"Generation loop error: {e}") + finally: + logger.info("Generation loop finished") + + def fit(self): + 
"""开始异步生成样本 - 改进的主运行逻辑""" + logger.info("Starting Rollouter...") + + if self.message_queue_client is None: + raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.") + + self.running = True + + # 在单独的线程中运行生成循环 + self.generation_thread = threading.Thread(target=self._generation_loop, daemon=True) + self.generation_thread.start() + + logger.info("Rollouter started successfully") + + try: + # 主线程保持运行,处理控制信号和状态监控 + last_stats_time = time.time() + stats_interval = 30.0 # 30秒报告一次统计 + + while self.running: + time.sleep(1.0) + + # 定期打印统计信息 + current_time = time.time() + if current_time - last_stats_time >= stats_interval: + self._log_statistics() + last_stats_time = current_time + + # 检查生成线程状态 + if not self.generation_thread.is_alive(): + logger.error("Generation thread died, restarting...") + self.generation_thread = threading.Thread(target=self._generation_loop, daemon=True) + self.generation_thread.start() + + except KeyboardInterrupt: + logger.info("Received interrupt signal, shutting down...") + except Exception as e: + logger.error(f"Error in main loop: {e}") + finally: + self.shutdown() + + def _log_statistics(self): + """记录统计信息""" + try: + controller_status = self.rollout_controller.get_status() + queue_stats = self.message_queue_client.get_statistics() + + logger.info( + f"Rollouter stats - Generated: {self.total_generated_samples}, " + f"Dropped: {self.dropped_stale_samples}, " + f"Errors: {self.generation_errors}, " + f"Queue size: {queue_stats['queue_size']}, " + f"Param version: {self.current_param_version}, " + f"Paused: {controller_status['is_paused']}, " + f"Sync requests: {self.param_sync_requests}" + ) + except Exception as e: + logger.error(f"Error logging statistics: {e}") + + def shutdown(self): + """关闭Rollouter - 改进的关闭逻辑""" + logger.info("Shutting down Rollouter...") + + self.running = False + + # 恢复可能被暂停的生成线程 + self.rollout_controller.resume() + + # 等待生成线程结束 + if self.generation_thread and 
self.generation_thread.is_alive(): + logger.info("Waiting for generation thread to finish...") + self.generation_thread.join(timeout=10.0) + + if self.generation_thread.is_alive(): + logger.warning("Generation thread did not finish within timeout") + + # 关闭线程池 + if self.thread_executor: + self.thread_executor.shutdown(wait=True) + + # 清理异步rollout管理器 + if hasattr(self, "async_rollout_manager"): + try: + # TODO: 添加异步rollout管理器的清理逻辑 + pass + except Exception as e: + logger.warning(f"Error cleaning up async rollout manager: {e}") + + logger.info("Rollouter shutdown complete") + + def get_statistics(self) -> dict: + """获取统计信息 - 改进的统计信息""" + controller_status = self.rollout_controller.get_status() + + stats = { + "total_generated_samples": self.total_generated_samples, + "dropped_stale_samples": self.dropped_stale_samples, + "generation_errors": self.generation_errors, + "current_param_version": self.current_param_version, + "param_sync_requests": self.param_sync_requests, + "last_sync_time": self.last_sync_time, + "is_running": self.running, + "sync_in_progress": self.sync_in_progress, + } + + stats.update(controller_status) + + # 添加队列统计(如果可用) + if self.message_queue_client: + try: + queue_stats = self.message_queue_client.get_statistics() + stats["queue_size"] = queue_stats.get("queue_size", 0) + stats["queue_total_produced"] = queue_stats.get("total_produced", 0) + stats["queue_dropped_samples"] = queue_stats.get("dropped_samples", 0) + except Exception as e: + logger.debug(f"Error getting queue statistics: {e}") + + return stats diff --git a/recipe/fully_async_policy/rollouter.py b/recipe/fully_async_policy/rollouter.py deleted file mode 100644 index ac43b6e3dbf..00000000000 --- a/recipe/fully_async_policy/rollouter.py +++ /dev/null @@ -1,413 +0,0 @@ -# Copyright 2025 Meituan Ltd. and/or its affiliates -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import threading -import time -import uuid - -import numpy as np -import ray -from omegaconf import OmegaConf -from torch.utils.data import Dataset, Sampler - -from recipe.fully_async_policy.message_queue import MessageQueueClient -from verl import DataProto -from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup -from verl.single_controller.ray.base import create_colocated_worker_cls -from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role, WorkerType -from verl.utils.debug import marked_timer - -logger = logging.getLogger(__name__) - - -class RolloutController: - """控制rollout的暂停和恢复""" - - def __init__(self): - self.is_paused = False - self.pause_event = threading.Event() - self.resume_event = threading.Event() - self.resume_event.set() # 初始状态为可运行 - self.pending_requests = [] - self.lock = threading.RLock() - - def pause(self): - """暂停rollout""" - with self.lock: - if not self.is_paused: - self.is_paused = True - self.resume_event.clear() - self.pause_event.set() - logger.info("Rollout paused") - - def resume(self): - """恢复rollout""" - with self.lock: - if self.is_paused: - self.is_paused = False - self.pause_event.clear() - self.resume_event.set() - logger.info("Rollout resumed") - - def wait_if_paused(self, timeout: float = None): - """如果被暂停则等待恢复""" - if self.is_paused: - self.resume_event.wait(timeout) - - def is_pause_requested(self) -> bool: - """检查是否有暂停请求""" - return self.pause_event.is_set() - - -class Rollouter: - """ - 异步样本生成器,负责持续生成训练样本并放入MessageQueue - """ - - def __init__( - self, - config, - 
tokenizer, - role_worker_mapping: dict[Role, WorkerType], - resource_pool_manager: ResourcePoolManager, - ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, - processor=None, - train_dataset: Dataset | None = None, - collate_fn=None, - train_sampler: Sampler | None = None, - device_name="cuda", - ): - self.config = config - self.tokenizer = tokenizer - self.processor = processor - self.role_worker_mapping = role_worker_mapping - self.resource_pool_manager = resource_pool_manager - self.ray_worker_group_cls = ray_worker_group_cls - self.device_name = device_name - - # 数据相关 - self.train_dataset = train_dataset - self.collate_fn = collate_fn - self.train_sampler = train_sampler - - # Rollout控制 - self.rollout_controller = RolloutController() - self.current_param_version = 0 - - # 新鲜度控制 - self.freshness_threshold = config.async_training.get("freshness_threshold", 3) - self.max_staleness_allowed = config.async_training.get("max_staleness_allowed", 5) - - # 统计信息 - self.total_generated_samples = 0 - self.dropped_stale_samples = 0 - self.pause_count = 0 - - # Worker groups - self.rollout_wg = None - self.message_queue_client = None - - # 运行状态 - self.running = False - self.generation_thread = None - - def init_workers(self): - """初始化rollout workers""" - logger.info("Initializing Rollouter workers...") - - self.resource_pool_manager.create_resource_pool() - self.resource_pool_to_cls = {pool: {} for pool in self.resource_pool_manager.resource_pool_dict.values()} - - # 只创建rollout worker - resource_pool = self.resource_pool_manager.get_resource_pool(Role.Rollout) - role_cls = RayClassWithInitArgs( - cls=self.role_worker_mapping[Role.Rollout], - config=self.config.actor_rollout_ref, - role="rollout", - ) - self.resource_pool_to_cls[resource_pool]["rollout"] = role_cls - - # 初始化WorkerGroup - all_wg = {} - wg_kwargs = {} - if OmegaConf.select(self.config.trainer, "ray_wait_register_center_timeout") is not None: - wg_kwargs["ray_wait_register_center_timeout"] = 
self.config.trainer.ray_wait_register_center_timeout - - for resource_pool, class_dict in self.resource_pool_to_cls.items(): - worker_dict_cls = create_colocated_worker_cls(class_dict=class_dict) - wg_dict = self.ray_worker_group_cls( - resource_pool=resource_pool, - ray_cls_with_init=worker_dict_cls, - device_name=self.device_name, - **wg_kwargs, - ) - spawn_wg = wg_dict.spawn(prefix_set=class_dict.keys()) - all_wg.update(spawn_wg) - - self.rollout_wg = all_wg["rollout"] - self.rollout_wg.init_model() - logger.info("Rollouter workers initialized successfully") - - def set_message_queue_client(self, message_queue_client: MessageQueueClient): - """设置消息队列客户端""" - self.message_queue_client = message_queue_client - - def update_rollout_weights(self, param_version: int): - """ - 更新rollout模型参数 - 这个方法由外部Trainer调用 - """ - logger.info(f"Updating rollout weights to version {param_version}") - - # 暂停rollout - self.rollout_controller.pause() - - try: - # 暂停推理引擎 - ray.get(self.rollout_wg.sleep.remote()) - - # 执行参数同步 - # 这里需要与actor建立同步机制 - if hasattr(self, "param_synchronizer") and self.param_synchronizer: - self.param_synchronizer.sync_weights() - else: - logger.warning("Parameter synchronizer not available, skipping weight sync") - - # 更新参数版本 - self.current_param_version = param_version - - # 恢复推理引擎 - ray.get(self.rollout_wg.wake_up.remote()) - - finally: - # 恢复rollout - self.rollout_controller.resume() - - logger.info(f"Rollout weights updated to version {param_version}") - - def set_parameter_synchronizer(self, param_synchronizer): - """设置参数同步器""" - self.param_synchronizer = param_synchronizer - - def _create_dataloader(self): - """创建数据加载器""" - from torch.utils.data import DataLoader - - return DataLoader( - self.train_dataset, - batch_size=self.config.data.train_batch_size, - sampler=self.train_sampler, - collate_fn=self.collate_fn, - num_workers=self.config.data.get("dataloader_num_workers", 0), - drop_last=True, - ) - - def _create_continuous_iterator(self): - 
"""创建连续的数据迭代器""" - dataloader = self._create_dataloader() - - for epoch in range(self.config.trainer.total_epochs): - for batch_dict in dataloader: - yield epoch, batch_dict - - def _should_pause_generation(self) -> bool: - """ - 判断是否应该暂停生成,基于新鲜度控制 - """ - if self.message_queue_client is None: - return False - - queue_stats = self.message_queue_client.get_statistics() - queue_size = queue_stats["queue_size"] - current_trainer_version = queue_stats["current_param_version"] - - # 计算参数版本差异 - version_diff = self.current_param_version - current_trainer_version - - # 如果版本差异过大,暂停生成 - if version_diff >= self.max_staleness_allowed: - logger.info( - f"Pausing generation due to staleness: rollout_version={self.current_param_version}, " - f"trainer_version={current_trainer_version}, diff={version_diff}" - ) - return True - - # 如果队列太满,也暂停生成 - max_queue_size = self.freshness_threshold * self.config.data.train_batch_size - if queue_size >= max_queue_size: - logger.info(f"Pausing generation due to full queue: size={queue_size}, max={max_queue_size}") - return True - - return False - - def _generate_batch(self, epoch: int, batch_dict: dict) -> DataProto | None: - """生成单个batch的样本""" - try: - batch = DataProto.from_single_dict(batch_dict) - - # 处理batch用于生成 - batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"] - non_tensor_batch_keys_to_pop = ["raw_prompt_ids"] - - # 处理多模态数据 - if "multi_modal_data" in batch.non_tensor_batch: - non_tensor_batch_keys_to_pop.append("multi_modal_data") - if "raw_prompt" in batch.non_tensor_batch: - non_tensor_batch_keys_to_pop.append("raw_prompt") - if "tools_kwargs" in batch.non_tensor_batch: - non_tensor_batch_keys_to_pop.append("tools_kwargs") - if "interaction_kwargs" in batch.non_tensor_batch: - non_tensor_batch_keys_to_pop.append("interaction_kwargs") - - gen_batch = batch.pop( - batch_keys=batch_keys_to_pop, - non_tensor_batch_keys=non_tensor_batch_keys_to_pop, - ) - - # 重复生成多个响应 - gen_batch = 
gen_batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True) - - # 执行生成 - if self.config.actor_rollout_ref.rollout.mode == "async": - gen_batch_output = ray.get(self.rollout_wg.async_generate_sequences.remote(gen_batch)) - else: - gen_batch_output = ray.get(self.rollout_wg.generate_sequences.remote(gen_batch)) - - # 添加UID - batch.non_tensor_batch["uid"] = np.array([str(uuid.uuid4()) for _ in range(len(batch.batch))], dtype=object) - - # 重复原始batch以对齐生成的响应 - batch = batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True) - - # 合并数据 - final_batch = batch.union(gen_batch_output) - - return final_batch - - except Exception as e: - logger.error(f"Error generating batch: {e}") - return None - - def _generation_loop(self): - """主要的生成循环""" - logger.info("Starting generation loop...") - - continuous_iterator = self._create_continuous_iterator() - - for epoch, batch_dict in continuous_iterator: - if not self.running: - break - - # 等待如果被暂停 - self.rollout_controller.wait_if_paused(timeout=1.0) - - if not self.running: - break - - # 检查是否应该暂停生成 - if self._should_pause_generation(): - time.sleep(1.0) # 等待一段时间再检查 - continue - - # 生成样本 - timing_raw = {} - with marked_timer("generate_batch", timing_raw): - generated_batch = self._generate_batch(epoch, batch_dict) - - if generated_batch is not None: - # 放入队列 - rollout_metadata = { - "timing": timing_raw, - "generation_timestamp": time.time(), - } - - success = self.message_queue_client.put_batch( - epoch=epoch, - batch=generated_batch, - param_version=self.current_param_version, - rollout_metadata=rollout_metadata, - ) - - if success: - self.total_generated_samples += 1 - if self.total_generated_samples % 10 == 0: - logger.info( - f"Generated {self.total_generated_samples} batches, " - f"param_version={self.current_param_version}" - ) - else: - self.dropped_stale_samples += 1 - logger.warning(f"Dropped stale sample, total dropped: {self.dropped_stale_samples}") - - 
logger.info("Generation loop finished") - - def fit(self): - """开始异步生成样本""" - logger.info("Starting Rollouter...") - - if self.message_queue_client is None: - raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.") - - self.running = True - - # 在单独的线程中运行生成循环 - self.generation_thread = threading.Thread(target=self._generation_loop, daemon=True) - self.generation_thread.start() - - try: - # 主线程保持运行,处理控制信号 - while self.running: - time.sleep(1.0) - - # 定期打印统计信息 - if self.total_generated_samples > 0 and self.total_generated_samples % 100 == 0: - queue_stats = self.message_queue_client.get_statistics() - logger.info( - f"Rollouter stats - Generated: {self.total_generated_samples}, " - f"Dropped: {self.dropped_stale_samples}, " - f"Queue size: {queue_stats['queue_size']}, " - f"Param version: {self.current_param_version}" - ) - - except KeyboardInterrupt: - logger.info("Received interrupt signal, shutting down...") - finally: - self.shutdown() - - def shutdown(self): - """关闭Rollouter""" - logger.info("Shutting down Rollouter...") - - self.running = False - - # 恢复可能被暂停的生成线程 - self.rollout_controller.resume() - - # 等待生成线程结束 - if self.generation_thread and self.generation_thread.is_alive(): - self.generation_thread.join(timeout=5.0) - - logger.info("Rollouter shutdown complete") - - def get_statistics(self) -> dict: - """获取统计信息""" - return { - "total_generated_samples": self.total_generated_samples, - "dropped_stale_samples": self.dropped_stale_samples, - "current_param_version": self.current_param_version, - "pause_count": self.pause_count, - "is_running": self.running, - "is_paused": self.rollout_controller.is_paused, - } From 2df18111a7b2e66e289c6cea94389c8d2f677568 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Wed, 30 Jul 2025 21:35:53 +0800 Subject: [PATCH 006/182] cpu mq --- recipe/fully_async_policy/message_queue.py | 60 ++++++++++++++++++++-- recipe/fully_async_policy/test_mq.py | 1 + 2 files changed, 57 insertions(+), 4 deletions(-) 
diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py index f57d1e15325..58996d4266e 100644 --- a/recipe/fully_async_policy/message_queue.py +++ b/recipe/fully_async_policy/message_queue.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import logging import threading import time import uuid @@ -24,6 +25,8 @@ from filelock import FileLock from omegaconf import DictConfig +logger = logging.getLogger(__name__) + @dataclass class BatchSample: @@ -40,7 +43,8 @@ class BatchSample: @ray.remote(num_cpus=1) class MessageQueue: """ - 基于ZeroMQ的异步消息队列,用于Rollouter和Trainer之间的通信 + 简化的Ray-based异步消息队列,用于Rollouter和Trainer之间的通信 + 去掉了ZeroMQ的复杂性,使用更可靠的Ray机制 """ def __init__(self, config: DictConfig, max_queue_size: int = 1000): @@ -49,7 +53,7 @@ def __init__(self, config: DictConfig, max_queue_size: int = 1000): self.queue = deque(maxlen=max_queue_size) self.current_param_version = 0 - # 安全地获取配置值,避免递归问题 + # 安全地获取配置值 try: if hasattr(config, "async_training") and config.async_training is not None: self.freshness_threshold = getattr(config.async_training, "freshness_threshold", 3) @@ -69,15 +73,22 @@ def __init__(self, config: DictConfig, max_queue_size: int = 1000): # Threading for message handling self.running = True + + # 线程安全 self.lock = threading.RLock() self.consumer_waiting = False self.consumer_condition = threading.Condition(self.lock) - # Statistics + # 统计信息 self.total_produced = 0 self.total_consumed = 0 self.dropped_samples = 0 + logger.info( + f"MessageQueue initialized with max_queue_size={max_queue_size}," + "freshness_threshold={self.freshness_threshold}" + ) + def _setup_zmq(self): """设置ZeroMQ socket""" with FileLock("/tmp/verl_message_queue.lock"): @@ -113,6 +124,7 @@ def put_batch(self, epoch: int, batch: Any, param_version: int, rollout_metadata staleness = self.current_param_version - param_version if staleness >= self.freshness_threshold: 
self.dropped_samples += 1 + logger.debug(f"Dropped stale sample: staleness={staleness}, threshold={self.freshness_threshold}") return False sample = BatchSample( @@ -128,7 +140,7 @@ def put_batch(self, epoch: int, batch: Any, param_version: int, rollout_metadata if len(self.queue) >= self.max_queue_size: removed = self.queue.popleft() self.dropped_samples += 1 - print(f"Queue full, dropped sample {removed.batch_id}") + logger.warning(f"Queue full, dropped sample {removed.batch_id}") self.queue.append(sample) self.total_produced += 1 @@ -137,6 +149,9 @@ def put_batch(self, epoch: int, batch: Any, param_version: int, rollout_metadata if self.consumer_waiting: self.consumer_condition.notify() + if self.total_produced % 100 == 0: + logger.debug(f"MessageQueue stats: produced={self.total_produced}, queue_size={len(self.queue)}") + return True def get_batch(self, min_batch_count: int = 1, timeout: float = 30.0) -> Optional[list[BatchSample]]: @@ -174,7 +189,9 @@ def get_batch(self, min_batch_count: int = 1, timeout: float = 30.0) -> Optional def update_param_version(self, version: int): """更新当前参数版本""" with self.lock: + old_version = self.current_param_version self.current_param_version = version + logger.debug(f"Parameter version updated from {old_version} to {version}") def get_queue_size(self) -> int: """获取当前队列长度""" @@ -191,12 +208,15 @@ def get_statistics(self) -> dict[str, Any]: "dropped_samples": self.dropped_samples, "current_param_version": self.current_param_version, "freshness_threshold": self.freshness_threshold, + "max_queue_size": self.max_queue_size, } def clear_queue(self): """清空队列""" with self.lock: + cleared_count = len(self.queue) self.queue.clear() + logger.info(f"Cleared {cleared_count} samples from queue") def shutdown(self): """关闭消息队列""" @@ -206,6 +226,34 @@ def shutdown(self): if self.context: self.context.term() + def get_memory_usage(self) -> dict: + """获取内存使用统计""" + with self.lock: + # 估算队列中样本的内存使用 + import sys + + total_size = 0 + sample_count = 
len(self.queue) + + if sample_count > 0: + # 估算单个样本的大小(简化估算) + sample = list(self.queue)[0] + try: + sample_size = sys.getsizeof(sample) + if hasattr(sample.data, "batch") and hasattr(sample.data.batch, "__len__"): + # 如果有batch信息,估算数据大小 + batch_size = len(sample.data.batch) + sample_size += batch_size * 1000 # 粗略估算每个batch条目1KB + total_size = sample_size * sample_count + except Exception: + total_size = sample_count * 10000 # 粗略估算每个样本10KB + + return { + "queue_samples": sample_count, + "estimated_memory_bytes": total_size, + "estimated_memory_mb": total_size / (1024 * 1024), + } + def get_address(self) -> str: """获取ZeroMQ地址""" return self.address @@ -244,3 +292,7 @@ def clear_queue(self): def shutdown(self): """关闭队列""" ray.get(self.queue_actor.shutdown.remote()) + + def get_memory_usage(self) -> dict: + """获取内存使用统计""" + return ray.get(self.queue_actor.get_memory_usage.remote()) diff --git a/recipe/fully_async_policy/test_mq.py b/recipe/fully_async_policy/test_mq.py index 488b7d12614..3659911319e 100644 --- a/recipe/fully_async_policy/test_mq.py +++ b/recipe/fully_async_policy/test_mq.py @@ -227,6 +227,7 @@ def test_get_statistics(self, message_queue_actor): "dropped_samples", "current_param_version", "freshness_threshold", + "max_queue_size", } assert set(stats.keys()) == expected_keys assert isinstance(stats["queue_size"], int) From 48e91a3d2471457510dffa5a79cf9f00f25976d1 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Wed, 30 Jul 2025 21:50:29 +0800 Subject: [PATCH 007/182] one_step_off_policy --- recipe/one_step_off_policy/main_ppo.py | 51 +++++--------------------- 1 file changed, 9 insertions(+), 42 deletions(-) diff --git a/recipe/one_step_off_policy/main_ppo.py b/recipe/one_step_off_policy/main_ppo.py index 44a0f4b8675..d6072c5521e 100644 --- a/recipe/one_step_off_policy/main_ppo.py +++ b/recipe/one_step_off_policy/main_ppo.py @@ -23,58 +23,18 @@ import ray from omegaconf import OmegaConf -from verl.trainer.constants_ppo import get_ppo_ray_runtime_env from 
verl.trainer.main_ppo import create_rl_dataset, create_rl_sampler from verl.trainer.ppo.reward import load_reward_manager from .ray_trainer import OneStepOffRayTrainer -@hydra.main(config_path="config", config_name="one_step_off_ppo_trainer", version_base=None) -def main(config): - run_ppo(config) - - -# Define a function to run the PPO-like training process -def run_ppo(config) -> None: - # Check if Ray is not initialized - if not ray.is_initialized(): - # Initialize Ray with a local cluster configuration - # Set environment variables in the runtime environment to control tokenizer parallelism, - # NCCL debug level, VLLM logging level, and allow runtime LoRA updating - # `num_cpus` specifies the number of CPU cores Ray can use, obtained from the configuration - ray.init( - runtime_env=get_ppo_ray_runtime_env(), - num_cpus=config.ray_init.num_cpus, - ) - - # Create a remote instance of the TaskRunner class, and - # Execute the `run` method of the TaskRunner instance remotely and wait for it to complete - if ( - OmegaConf.select(config.trainer, "profile_steps") is not None - and len(OmegaConf.select(config.trainer, "profile_steps")) > 0 - ): - nsight_options = OmegaConf.to_container(config.trainer.controller_nsight_options) - runner = TaskRunner.options(runtime_env={"nsight": nsight_options}).remote() - else: - runner = TaskRunner.remote() - ray.get(runner.run.remote(config)) - - # [Optional] get the path of the timeline trace file from the configuration, default to None - # This file is used for performance analysis - timeline_json_file = config.ray_init.get("timeline_json_file", None) - if timeline_json_file: - ray.timeline(filename=timeline_json_file) - - @ray.remote(num_cpus=1) # please make sure main_task is not scheduled on head -class TaskRunner: +class OneStepOffTaskRunner: def run(self, config): # Print the initial configuration. `resolve=True` will evaluate symbolic values. 
from pprint import pprint - from omegaconf import OmegaConf - from verl.utils.fs import copy_to_local print(f"TaskRunner hostname: {socket.gethostname()}, PID: {os.getpid()}") @@ -172,7 +132,7 @@ def run(self, config): # finally, we combine all the rewards together # The reward type depends on the tag of the data if config.reward_model.enable: - if config.reward_model.strategy in ["fsdp2"]: + if config.reward_model.strategy == "fsdp2": from verl.workers.fsdp_workers import RewardModelWorker elif config.reward_model.strategy == "megatron": from verl.workers.megatron_workers import RewardModelWorker @@ -224,5 +184,12 @@ def run(self, config): trainer.fit() +@hydra.main(config_path="config", config_name="one_step_off_ppo_trainer", version_base=None) +def main(config): + from verl.trainer.main_ppo import run_ppo + + run_ppo(config) + + if __name__ == "__main__": main() From 07f2e62de973fc12bcd68b0f91cbf69622d580d8 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Wed, 30 Jul 2025 21:51:04 +0800 Subject: [PATCH 008/182] md --- .../fully_async_policy/README_fully_async.md | 381 ++++++++++++------ 1 file changed, 253 insertions(+), 128 deletions(-) diff --git a/recipe/fully_async_policy/README_fully_async.md b/recipe/fully_async_policy/README_fully_async.md index 979f9aff783..4c1866788a5 100644 --- a/recipe/fully_async_policy/README_fully_async.md +++ b/recipe/fully_async_policy/README_fully_async.md @@ -1,183 +1,308 @@ -# 完全异步训练工作流 (Fully Async Training Workflow) +# 完全异步PPO训练系统 (Fully Async Policy) -## 概述 +本文档介绍了基于 OneStepOffRayTrainer 成熟实现改进的完全异步PPO训练系统,该系统实现了 Trainer 和 Rollouter 的完全解耦,支持异步样本生成和训练。 -本项目实现了基于现有 one step off policy 代码的完全异步训练工作流,将样本生成(Rollouter)和模型训练(Trainer)完全解耦,通过 MessageQueue 进行异步通信。 +## 🚀 **系统特性** -## 架构设计 +### 核心特性 +- **完全异步训练**: Trainer 和 Rollouter 在独立的Ray Actor中运行,实现真正的并行处理 +- **智能新鲜度控制**: 基于参数版本和时间戳的样本新鲜度管理,防止过期样本影响训练 +- **健壮的参数同步**: 改进的参数同步机制,支持错误重试和状态管理 +- **简化的消息队列**: 去除ZeroMQ依赖,使用Ray-based消息传递,更稳定可靠 +- **完善的监控**: 详细的性能指标和组件健康状态监控 -### 核心组件 +### 
改进亮点 +- **参考OneStepOffRayTrainer**: 使用成熟的训练逻辑,确保训练稳定性 +- **错误处理和恢复**: 完善的异常处理和资源清理机制 +- **组件协调**: 统一的组件生命周期管理和状态监控 +- **配置验证**: 智能的配置验证和默认值设置 -1. **MessageQueue**: 基于 ZeroMQ 的异步消息队列,作为 Ray Actor 存在 - - 管理生成的样本队列 - - 支持新鲜度控制,自动丢弃过期样本 - - 提供线程安全的生产者-消费者接口 +## 🏗️ **系统架构** -2. **Rollouter**: 专门负责样本生成的组件 - - 持续循环生成训练样本 - - 支持暂停/恢复机制,用于参数更新 - - 实现新鲜度阈值控制,避免生成过多过期样本 +### 组件结构 -3. **FullyAsyncTrainer**: 修改后的训练器 - - 从 MessageQueue 获取样本进行训练 - - 训练完成后通知 Rollouter 更新参数 - - 支持样本新鲜度监控和统计 - -4. **ParameterSynchronizer**: 参数同步模块 - - 基于 NCCL 实现高效的参数同步 - - 支持 Actor 到 Rollout 的参数传递 +``` +┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ +│ FullyAsyncMain │────│ MessageQueue │────│ FullyAsyncTrainer│ +│ (Coordinator) │ │ (Ray Actor) │ │ (Ray Actor) │ +└─────────────────┘ └─────────────────┘ └─────────────────┘ + │ │ │ + └───────────────────────┼───────────────────────┘ + │ + ┌─────────────────┐ + │ Rollouter │ + │ (Ray Actor) │ + └─────────────────┘ + │ + ┌─────────────────┐ + │ ParameterSync │ + │ Manager │ + └─────────────────┘ +``` -### 工作流程 +### 数据流 ``` -┌─────────────┐ put_batch ┌──────────────┐ get_batch ┌─────────────┐ -│ Rollouter │ ──────────────► │ MessageQueue │ ──────────────► │ Trainer │ -│ │ │ │ │ │ -│ - 生成样本 │ │ - 队列管理 │ │ - 模型训练 │ -│ - 暂停/恢复 │ │ - 新鲜度控制 │ │ - 参数更新 │ -│ - 新鲜度控制 │ │ - 统计信息 │ │ - 同步通知 │ -└─────────────┘ └──────────────┘ └─────────────┘ - ▲ │ - │ update_rollout_weights │ - └─────────────────────────────────────────────────────────────────┘ +1. 数据生成: Rollouter → MessageQueue +2. 训练消费: MessageQueue → FullyAsyncTrainer +3. 参数同步: FullyAsyncTrainer → Rollouter +4. 状态监控: FullyAsyncMain → All Components ``` -## 新鲜度控制机制 +## 📋 **核心组件** + +### 1. FullyAsyncTrainer +- **功能**: 从MessageQueue获取样本进行异步训练 +- **特性**: + - 基于OneStepOffRayTrainer的成熟训练逻辑 + - 智能的样本新鲜度指标计算 + - 完善的错误处理和重试机制 + - 详细的训练性能监控 + +### 2. Rollouter +- **功能**: 持续生成训练样本并放入MessageQueue +- **特性**: + - 智能的暂停/恢复控制机制 + - 基于新鲜度的生成控制 + - 改进的参数同步处理 + - 异步/同步生成模式支持 + +### 3. 
MessageQueue +- **功能**: Ray-based消息队列,管理样本传递 +- **特性**: + - 去除ZeroMQ依赖,更稳定可靠 + - 智能的样本过期检测 + - 线程安全的队列操作 + - 内存使用监控 + +### 4. ParameterSynchronizer +- **功能**: 管理Actor和Rollout间的参数同步 +- **特性**: + - 支持错误重试和超时处理 + - 详细的同步状态跟踪 + - 集群通信组管理 + +### 5. FullyAsyncMain +- **功能**: 系统协调器,管理所有组件的生命周期 +- **特性**: + - 统一的组件初始化和清理 + - 实时的健康状态监控 + - 优雅的关闭和错误恢复 + +## ⚙️ **配置说明** + +### 异步训练配置 (async_training) -### 配置参数 +```yaml +async_training: + # 新鲜度控制 + freshness_threshold: 3 # 样本新鲜度阈值 + max_staleness_allowed: 5 # 最大允许的样本陈旧度 + + # 队列管理 + max_queue_size: 1000 # 消息队列最大大小 + min_batch_count: 1 # 每次获取的最小batch数量 + batch_timeout: 30.0 # 获取batch的超时时间 + + # 生成控制 + generation_timeout: 30.0 # 单次生成的超时时间 + batch_generation_interval: 0.1 # batch生成间隔 + + # 参数同步 + max_sync_retries: 3 # 参数同步最大重试次数 + sync_timeout: 30.0 # 同步超时时间 + sync_retry_delay: 1.0 # 重试延迟时间 +``` -- `freshness_threshold`: 新鲜度阈值,队列中超过此版本差异的样本会被丢弃 -- `max_staleness_allowed`: 最大允许的新鲜度差异,Rollouter 会暂停生成 -- `max_queue_size`: MessageQueue 的最大队列大小 +### 资源配置 -### 控制逻辑 +```yaml +trainer: + n_gpus_per_node: 4 # 每个训练节点的GPU数量 + nnodes: 2 # 训练节点数量 + device: cuda + +rollout: + n_gpus_per_node: 2 # 每个rollout节点的GPU数量 + nnodes: 1 # rollout节点数量 +``` -1. **样本丢弃**: 当样本的参数版本与当前 Trainer 版本差异超过 `freshness_threshold` 时,样本被丢弃 -2. **生成暂停**: 当 Rollouter 的参数版本与 Trainer 版本差异超过 `max_staleness_allowed` 时,暂停生成 -3. **队列管理**: 队列长度限制为 `freshness_threshold * batch_size`,避免内存溢出 +## 🔧 **使用方法** -## 性能优势 +### 1. 基本运行 -### 相比同步训练 +```bash +# 使用默认配置运行 +python fully_async_main.py -- **GPU 利用率提升**: 生成和训练并行进行,减少 GPU 空闲时间 -- **长尾样本优化**: 训练不需要等待最慢的样本生成完成 -- **资源隔离**: 可以独立配置生成和训练的资源分配 +# 使用自定义配置 +python fully_async_main.py --config-path /path/to/config --config-name my_config +``` -### 相比 One Step Off Policy +### 2. 配置自定义 -- **更高的异步度**: 完全解耦生成和训练,支持多步异步 -- **更灵活的控制**: 支持动态的新鲜度控制和队列管理 -- **更好的监控**: 提供详细的统计信息和性能指标 +```python +# 在配置文件中自定义异步训练参数 +async_training: + freshness_threshold: 5 + max_queue_size: 2000 + generation_timeout: 60.0 +``` -## 使用方法 +### 3. 监控和调试 -### 1. 
安装依赖 +```python +# 系统会自动输出详细的统计信息 +# 包括: Trainer状态、Rollouter状态、队列状态等 -```bash -pip install zmq filelock +# 日志文件: fully_async_training.log +# 包含所有组件的详细日志信息 ``` -### 2. 配置文件 +## 📊 **性能监控** -使用 `config/fully_async_ppo_trainer.yaml` 配置文件,关键配置项: +### 关键指标 -```yaml -async_training: - freshness_threshold: 3 # 新鲜度阈值 - max_staleness_allowed: 5 # 最大允许新鲜度差异 - max_queue_size: 1000 # 队列最大大小 - min_batch_count: 1 # 最小batch数量 - batch_timeout: 30.0 # 获取batch超时时间 - -actor_rollout_ref: - rollout: - mode: async # 使用异步模式 - n_gpus: 4 # rollout专用GPU数量 - name: vllm # 使用vLLM引擎 -``` +#### Trainer指标 +- `global_steps`: 训练步数 +- `processed_samples`: 已处理样本数 +- `current_param_version`: 当前参数版本 +- `param_sync_count`: 参数同步次数 -### 3. 启动训练 +#### Rollouter指标 +- `total_generated_samples`: 总生成样本数 +- `dropped_stale_samples`: 丢弃的过期样本数 +- `generation_errors`: 生成错误数 +- `param_sync_requests`: 参数同步请求数 -```bash -python -m recipe.one_step_off_policy.fully_async_main \ - data.train_files=~/data/train.parquet \ - data.val_files=~/data/val.parquet \ - actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \ - trainer.total_training_steps=1000 -``` +#### 新鲜度指标 +- `avg_sample_age`: 样本平均年龄 +- `max_sample_age`: 样本最大年龄 +- `stale_samples_ratio`: 过期样本比例 -### 4. 监控训练 +#### 队列指标 +- `queue_size`: 当前队列大小 +- `total_produced`: 总生产样本数 +- `total_consumed`: 总消费样本数 +- `dropped_samples`: 总丢弃样本数 -训练过程中会输出以下统计信息: +## 🔍 **故障排查** -- `queue_size`: 当前队列大小 -- `avg_sample_age`: 平均样本年龄(参数版本差异) -- `max_sample_age`: 最大样本年龄 -- `param_version`: 当前参数版本 -- `processed_samples`: 已处理样本数 -- `dropped_samples`: 丢弃的过期样本数 +### 常见问题 -## 性能调优建议 +1. **样本生成过慢** + - 检查 `generation_timeout` 设置 + - 监控 `generation_errors` 指标 + - 调整 `batch_generation_interval` -### 1. 资源分配 +2. **样本过期严重** + - 调整 `freshness_threshold` + - 检查参数同步频率 + - 监控 `stale_samples_ratio` -- **生成资源**: 根据模型大小和生成速度需求分配 GPU -- **训练资源**: 根据batch大小和训练复杂度分配 GPU -- **比例建议**: 生成:训练 = 1:2 到 1:3 +3. **队列溢出** + - 增加 `max_queue_size` + - 优化训练速度 + - 调整 `min_batch_count` -### 2. 新鲜度控制 +4. 
**参数同步失败** + - 检查 `sync_timeout` 设置 + - 监控 `sync_failures` 指标 + - 调整 `max_sync_retries` -- **快速生成场景**: 降低 `freshness_threshold` (2-3) -- **慢速生成场景**: 提高 `freshness_threshold` (5-8) -- **队列大小**: 设置为 `freshness_threshold * batch_size * 2` +### 日志分析 -### 3. 网络优化 +```bash +# 查看主要错误 +grep "ERROR" fully_async_training.log -- **单节点**: MessageQueue 使用 IPC 协议 -- **多节点**: MessageQueue 使用 TCP 协议,注意网络带宽 +# 查看组件统计 +grep "Component Statistics" fully_async_training.log -## 故障排除 +# 查看参数同步状态 +grep "Parameter sync" fully_async_training.log +``` -### 常见问题 +## 🚀 **性能优化建议** -1. **队列为空**: 检查 Rollouter 是否正常运行,是否被新鲜度控制暂停 -2. **内存溢出**: 减少 `max_queue_size` 或增加 `freshness_threshold` -3. **参数同步失败**: 检查 NCCL 配置和网络连接 -4. **性能下降**: 调整资源分配比例,监控 GPU 利用率 +### 1. 资源配置优化 +- 根据模型大小合理配置GPU数量 +- 训练和rollout使用独立的资源池 +- 考虑内存和计算的平衡 -### 调试模式 +### 2. 新鲜度控制优化 +- 根据模型收敛速度调整新鲜度阈值 +- 监控样本年龄分布,避免过度丢弃 +- 动态调整队列大小 -设置环境变量启用详细日志: +### 3. 参数同步优化 +- 合理设置同步频率,平衡性能和一致性 +- 使用异步同步减少等待时间 +- 监控同步耗时,及时发现问题 -```bash -export VERL_LOGGING_LEVEL=DEBUG -export NCCL_DEBUG=INFO +## 🔧 **扩展和定制** + +### 自定义组件 + +```python +# 自定义Trainer +class CustomFullyAsyncTrainer(FullyAsyncTrainer): + def _compute_custom_metrics(self, batch): + # 添加自定义指标计算 + pass + +# 自定义Rollouter +class CustomRollouter(Rollouter): + def _custom_generation_logic(self, batch): + # 添加自定义生成逻辑 + pass +``` + +### 自定义监控 + +```python +# 添加自定义监控指标 +def custom_monitor(trainer_stats, rollouter_stats): + # 实现自定义监控逻辑 + custom_metric = calculate_custom_metric(trainer_stats) + logger.info(f"Custom metric: {custom_metric}") ``` -## 与现有系统对比 +## 📚 **与OneStepOffRayTrainer的对比** + +| 特性 | OneStepOffRayTrainer | FullyAsyncTrainer | +|------|---------------------|------------------| +| 训练模式 | 同步批处理 | 异步流处理 | +| 参数更新 | 批次同步更新 | 实时异步更新 | +| 资源利用 | 阶段性利用 | 持续高效利用 | +| 新鲜度控制 | 无需考虑 | 智能控制 | +| 复杂度 | 相对简单 | 更复杂但更灵活 | +| 适用场景 | 标准训练 | 大规模持续训练 | + +## 📖 **最佳实践** -| 特性 | 同步训练 | One Step Off | 完全异步 | -|------|----------|--------------|----------| -| 异步程度 | 无 | 一步 | 多步 | -| 资源利用率 | 低 | 中 | 高 | -| 
实现复杂度 | 低 | 中 | 高 | -| 样本新鲜度 | 最新 | 一步延迟 | 可控延迟 | -| 内存使用 | 低 | 中 | 中-高 | +1. **配置调优**: 从默认配置开始,根据监控指标逐步优化 +2. **资源规划**: 合理分配训练和生成资源,避免瓶颈 +3. **监控预警**: 设置关键指标的阈值报警 +4. **定期检查**: 定期检查日志和性能指标 +5. **版本管理**: 记录配置变更和性能影响 -## 实验结果预期 +## 🤝 **贡献和反馈** -基于现有 one step off policy 的实验结果,完全异步训练预期能够: +欢迎提交issue和PR来改进这个异步训练系统! -- **训练速度**: 相比同步训练提升 30-50% -- **GPU 利用率**: 提升至 85-95% -- **内存开销**: 增加 20-30%(主要用于队列缓存) -- **模型收敛**: 与同步训练基本一致(在合理的新鲜度控制下) +## 📄 **更新日志** -## 后续改进 +### v2.0 (改进版本) +- ✅ 基于OneStepOffRayTrainer重构训练逻辑 +- ✅ 简化MessageQueue实现,去除ZeroMQ依赖 +- ✅ 改进参数同步机制,支持错误重试 +- ✅ 完善组件协调和监控系统 +- ✅ 优化错误处理和资源管理 +- ✅ 增加详细的性能指标和日志 -1. **自适应新鲜度控制**: 根据训练进度动态调整新鲜度阈值 -2. **多队列支持**: 支持不同优先级的样本队列 -3. **分布式队列**: 支持跨节点的分布式消息队列 -4. **更精细的资源调度**: 支持动态的资源分配和调整 +### v1.0 (原始版本) +- 基础异步训练框架 +- 简单的消息队列实现 +- 基本的参数同步功能 From 502de26f9ba6606dbef099db28cbc2b46551a0e3 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Wed, 30 Jul 2025 21:54:32 +0800 Subject: [PATCH 009/182] rollouter --- recipe/fully_async_policy/fully_async_rollouter.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index c127b242704..3ece39d0f10 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -115,7 +115,8 @@ def get_status(self) -> dict: } -class Rollouter: +@ray.remote +class FullyAsyncRollouter: """ 异步样本生成器,负责持续生成训练样本并放入MessageQueue 基于OneStepOffRayTrainer的成熟实现改进 From dbdfdbfed992b95feb51f8563133ebfd6462b2ba Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Thu, 31 Jul 2025 11:50:55 +0800 Subject: [PATCH 010/182] yaml --- .../config/fully_async_ppo_trainer.yaml | 153 ++++-------------- 1 file changed, 27 insertions(+), 126 deletions(-) diff --git a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml index cbc7058f108..19c4aa01339 100644 --- 
a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml +++ b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml @@ -6,131 +6,32 @@ defaults: - ppo_trainer - _self_ -# 完全异步训练的特殊配置 -async_training: - # 新鲜度阈值,超过此版本差异的样本会被丢弃 - freshness_threshold: 3 - - # 最大允许的新鲜度差异,rollout会暂停生成 - max_staleness_allowed: 5 - - # MessageQueue的最大队列大小 - max_queue_size: 1000 - - # 最小batch数量,trainer会等待至少这么多batch - min_batch_count: 1 - - # 获取batch的超时时间(秒) - batch_timeout: 30.0 - -# 重写默认的训练配置 -actor_rollout_ref: - hybrid_engine: false - rollout: - # 异步模式 - mode: async - - # rollout专用的GPU数量 - n_gpus: 4 - - # 使用vLLM异步rollout - name: vllm - - # 其他rollout参数 - temperature: 1.0 - top_k: -1 - top_p: 1.0 - tensor_model_parallel_size: 2 - gpu_memory_utilization: 0.6 - max_num_batched_tokens: 8192 - free_cache_engine: true - enforce_eager: true - -# 训练器配置 -trainer: - # 总训练步数 - total_training_steps: 1000 - - # 设备 - device: cuda - - # 保存频率 - save_freq: 100 - - # 验证频率 - val_freq: 50 - - # 日志配置 - logger: '["console", "wandb"]' - project_name: "fully_async_ppo" - experiment_name: "test_async_training" +# ============= 完全异步训练配置 (Fully Async Training Config) ============= -# 数据配置 -data: - # 训练batch大小 - train_batch_size: 128 - - # 数据文件路径 - train_files: "~/data/train.parquet" - val_files: "~/data/val.parquet" - - # 序列长度 - max_prompt_length: 1024 - max_response_length: 1024 - -# 算法配置 -algorithm: - # 优势估计器 - adv_estimator: gae - - # PPO参数 - cliprange: 0.2 - cliprange_value: 0.2 - vf_coeff: 0.1 - entropy_coeff: 0.01 - - # KL相关 - kl_coeff: 0.1 - adaptive_kl: true - target_kl: 0.01 - -# 模型配置 -actor_rollout_ref: - model: - # 模型路径 - path: "Qwen/Qwen2-7B-Instruct" - - # 使用LoRA - lora_rank: 64 - lora_alpha: 128 - lora_dropout: 0.1 - - actor: - # Actor优化器 - optim: - lr: 1e-6 - weight_decay: 0.01 - - # FSDP配置 - fsdp_config: - fsdp_size: -1 - param_offload: false - optimizer_offload: false - - # PPO配置 - ppo_mini_batch_size: 32 - use_dynamic_bsz: true - -# Critic配置 -critic: - model: - path: 
"Qwen/Qwen2-7B-Instruct" - - optim: - lr: 1e-5 - weight_decay: 0.01 - - fsdp_config: - fsdp_size: -1 - param_offload: false +async_training: + # 新鲜度控制 (Freshness Control) + freshness_threshold: 3 # 样本新鲜度阈值 + max_staleness_allowed: 5 # 最大允许的样本陈旧度 + + # 队列管理 (Queue Management) + max_queue_size: 1000 # 消息队列最大大小 + min_batch_count: 1 # 每次获取的最小batch数量 + batch_timeout: 30.0 # 获取batch的超时时间(秒) + + # 生成控制 (Generation Control) + generation_timeout: 30.0 # 单次生成的超时时间(秒) + batch_generation_interval: 0.1 # batch生成间隔(秒) + + # 参数同步 (Parameter Synchronization) + max_sync_retries: 3 # 参数同步最大重试次数 + sync_timeout: 30.0 # 同步超时时间(秒) + sync_retry_delay: 1.0 # 重试延迟时间(秒) + +# Rollout配置 +rollout: + nnodes: 1 # Number of nodes used in the rollout + n_gpus_per_node: 8 # Number of GPUs per node + mode: async # rollout模式: sync, async + name: vllm # rollout引擎: vllm, sglang + n: 4 # 每个prompt生成的响应数量 From 08c1ba14b9e93aae9bf7c91410d4512587d627a2 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Thu, 31 Jul 2025 14:09:07 +0800 Subject: [PATCH 011/182] trainer --- .../fully_async_policy/fully_async_trainer.py | 395 ++++++++++++------ recipe/fully_async_policy/param_sync.py | 305 ++++++++++++-- recipe/fully_async_policy/test_fully_async.py | 2 +- 3 files changed, 535 insertions(+), 167 deletions(-) diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 2487387b163..e66bc895c9c 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -13,10 +13,12 @@ # limitations under the License. 
import logging +import time from pprint import pprint import numpy as np import ray +import torch from omegaconf import OmegaConf from torch.utils.data import Dataset, Sampler from tqdm import tqdm @@ -25,12 +27,15 @@ from verl import DataProto from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup from verl.single_controller.ray.base import create_colocated_worker_cls +from verl.trainer.ppo import core_algos +from verl.trainer.ppo.core_algos import AdvantageEstimator, agg_loss from verl.trainer.ppo.metric_utils import ( compute_data_metrics, compute_throughout_metrics, compute_timing_metrics, ) from verl.trainer.ppo.ray_trainer import ( + RayPPOTrainer, ResourcePoolManager, Role, WorkerType, @@ -46,9 +51,11 @@ logger = logging.getLogger(__name__) -class FullyAsyncTrainer: +@ray.remote +class FullyAsyncTrainer(RayPPOTrainer): """ 完全异步的PPO训练器,从MessageQueue获取样本进行训练 + 基于OneStepOffRayTrainer的成熟实现改进 """ def __init__( @@ -73,6 +80,9 @@ def __init__( self.reward_fn = reward_fn self.val_reward_fn = val_reward_fn + self.hybrid_engine = config.actor_rollout_ref.hybrid_engine + assert not self.hybrid_engine + self.role_worker_mapping = role_worker_mapping self.resource_pool_manager = resource_pool_manager self.ray_worker_group_cls = ray_worker_group_cls @@ -85,12 +95,32 @@ def __init__( self.collate_fn = collate_fn self.train_sampler = train_sampler - # 角色配置 + # 角色配置 - 参考OneStepOffRayTrainer的配置 self.use_reference_policy = Role.RefPolicy in role_worker_mapping self.use_rm = Role.RewardModel in role_worker_mapping - self.use_critic = Role.Critic in role_worker_mapping self.ref_in_actor = config.actor_rollout_ref.model.get("lora_rank", 0) > 0 + # KL控制器 + if config.algorithm.use_kl_in_reward: + self.kl_ctrl_in_reward = core_algos.get_kl_controller(config.algorithm.kl_ctrl) + + # 确定是否使用critic - 参考OneStepOffRayTrainer的逻辑 + if self.config.algorithm.adv_estimator == AdvantageEstimator.GAE: + self.use_critic = True + elif self.config.algorithm.adv_estimator in [ + 
AdvantageEstimator.GRPO, + AdvantageEstimator.GRPO_PASSK, + AdvantageEstimator.REINFORCE_PLUS_PLUS, + # AdvantageEstimator.REMAX, # TODO:REMAX advantage estimator is not yet supported in one_step_off_policy + AdvantageEstimator.RLOO, + AdvantageEstimator.OPO, + AdvantageEstimator.REINFORCE_PLUS_PLUS_BASELINE, + AdvantageEstimator.GPG, + ]: + self.use_critic = False + else: + raise NotImplementedError(f"Unsupported advantage estimator: {self.config.algorithm.adv_estimator}") + # Worker groups self.actor_wg = None self.critic_wg = None @@ -111,6 +141,17 @@ def __init__( # 统计信息 self.processed_samples = 0 self.stale_samples_processed = 0 + self.param_sync_count = 0 + + self._validate_config() + + def _validate_config(self): + """验证配置""" + required_configs = ["trainer.total_training_steps", "algorithm.adv_estimator", "data.train_batch_size"] + + for config_path in required_configs: + if not OmegaConf.select(self.config, config_path): + raise ValueError(f"Missing required config: {config_path}") def set_message_queue_client(self, message_queue_client: MessageQueueClient): """设置消息队列客户端""" @@ -121,50 +162,58 @@ def set_rollouter_actor(self, rollouter_actor): self.rollouter_actor = rollouter_actor def init_workers(self): - """初始化训练workers""" + """初始化训练workers - 参考OneStepOffRayTrainer的实现""" logger.info("Initializing FullyAsyncTrainer workers...") self.resource_pool_manager.create_resource_pool() self.resource_pool_to_cls = {pool: {} for pool in self.resource_pool_manager.resource_pool_dict.values()} # 创建actor worker - actor_resource_pool = self.resource_pool_manager.get_resource_pool(Role.Actor) + resource_pool = self.resource_pool_manager.get_resource_pool(Role.Actor) actor_cls = RayClassWithInitArgs( cls=self.role_worker_mapping[Role.Actor], config=self.config.actor_rollout_ref, role="actor", ) - self.resource_pool_to_cls[actor_resource_pool]["actor"] = actor_cls + self.resource_pool_to_cls[resource_pool]["actor"] = actor_cls # 创建critic worker if self.use_critic: - 
critic_resource_pool = self.resource_pool_manager.get_resource_pool(Role.Critic) + resource_pool = self.resource_pool_manager.get_resource_pool(Role.Critic) critic_cls = RayClassWithInitArgs(cls=self.role_worker_mapping[Role.Critic], config=self.config.critic) - self.resource_pool_to_cls[critic_resource_pool]["critic"] = critic_cls + self.resource_pool_to_cls[resource_pool]["critic"] = critic_cls # 创建reference policy worker if self.use_reference_policy: - ref_resource_pool = self.resource_pool_manager.get_resource_pool(Role.RefPolicy) + resource_pool = self.resource_pool_manager.get_resource_pool(Role.RefPolicy) ref_policy_cls = RayClassWithInitArgs( cls=self.role_worker_mapping[Role.RefPolicy], config=self.config.actor_rollout_ref, role="ref", ) - self.resource_pool_to_cls[ref_resource_pool]["ref"] = ref_policy_cls + self.resource_pool_to_cls[resource_pool]["ref"] = ref_policy_cls # 创建reward model worker if self.use_rm: - rm_resource_pool = self.resource_pool_manager.get_resource_pool(Role.RewardModel) + resource_pool = self.resource_pool_manager.get_resource_pool(Role.RewardModel) rm_cls = RayClassWithInitArgs( cls=self.role_worker_mapping[Role.RewardModel], config=self.config.reward_model ) - self.resource_pool_to_cls[rm_resource_pool]["rm"] = rm_cls + self.resource_pool_to_cls[resource_pool]["rm"] = rm_cls - # 初始化WorkerGroup + # 初始化WorkerGroup - 参考OneStepOffRayTrainer的实现 all_wg = {} wg_kwargs = {} if OmegaConf.select(self.config.trainer, "ray_wait_register_center_timeout") is not None: wg_kwargs["ray_wait_register_center_timeout"] = self.config.trainer.ray_wait_register_center_timeout + if OmegaConf.select(self.config.trainer, "profile_steps") is not None: + wg_kwargs["profile_steps"] = OmegaConf.select(self.config.trainer, "profile_steps") + assert OmegaConf.select(self.config.trainer, "worker_nsight_options") is not None, ( + "worker_nsight_options must be set when profile_steps is set" + ) + wg_kwargs["worker_nsight_options"] = OmegaConf.to_container( + 
OmegaConf.select(self.config.trainer, "worker_nsight_options") + ) for resource_pool, class_dict in self.resource_pool_to_cls.items(): worker_dict_cls = create_colocated_worker_cls(class_dict=class_dict) @@ -197,55 +246,98 @@ def init_workers(self): def _load_checkpoint(self): """加载检查点""" - # 简化的检查点加载逻辑 - pass + # TODO: 实现检查点加载逻辑 + logger.info("Checkpoint loading not implemented yet") def _validate(self): - """执行验证""" + """执行验证 - 参考OneStepOffRayTrainer的验证逻辑""" if self.val_reward_fn is None: return None - # 简化的验证逻辑 - logger.info("Validation step skipped in async trainer") - return {"val_reward": 0.0} + # TODO: 实现完整的验证逻辑 + logger.info("Running validation...") + val_metrics = {"val_reward": 0.0} # 简化的验证指标 + return val_metrics def _save_checkpoint(self): """保存检查点""" - # 简化的检查点保存逻辑 - pass + # TODO: 实现检查点保存逻辑 + logger.info("Checkpoint saving not implemented yet") def _dump_generations(self, inputs, outputs, scores, reward_extra_infos_dict, dump_path): """保存生成结果""" - # 简化的生成结果保存逻辑 + # TODO: 实现生成结果保存逻辑 + logger.debug(f"Dumping generations to {dump_path}") + + def _balance_batch(self, batch: DataProto, metrics: dict): + """平衡batch中的有效token数量 - 参考OneStepOffRayTrainer的实现""" + # TODO: 实现batch平衡逻辑 pass - def _update_param_version_and_sync(self): - """更新参数版本并同步到Rollouter""" + def _sync_parameters_to_rollouter(self): + """同步参数到Rollouter - 改进的同步机制""" + if self.rollouter_actor is None: + logger.warning("Rollouter actor not set, skipping parameter sync") + return + self.current_param_version += 1 - # 通知MessageQueue更新参数版本 - self.message_queue_client.update_param_version(self.current_param_version) + try: + # 通知MessageQueue更新参数版本 + self.message_queue_client.update_param_version(self.current_param_version) - # 通知Rollouter更新参数 - if self.rollouter_actor is not None: - ray.get(self.rollouter_actor.update_rollout_weights.remote(self.current_param_version)) + # 同步参数到Rollouter + sync_future = self.rollouter_actor.update_rollout_weights.remote(self.current_param_version) + 
ray.get(sync_future) + + self.param_sync_count += 1 + logger.info(f"Parameter sync completed, version: {self.current_param_version}") + + except Exception as e: + logger.error(f"Failed to sync parameters: {e}") + self.current_param_version -= 1 # 回滚版本号 + raise def _process_batch_samples(self, batch_samples: list[BatchSample]) -> DataProto: - """处理从队列获取的batch样本""" + """处理从队列获取的batch样本 - 改进的批处理逻辑""" + if not batch_samples: + raise ValueError("Empty batch samples") + if len(batch_samples) == 1: return batch_samples[0].data - # 如果有多个batch,需要合并 - all_batches = [sample.data for sample in batch_samples] - return DataProto.concat(all_batches) + # 合并多个batch - 使用DataProto的concat方法 + try: + all_batches = [sample.data for sample in batch_samples] + merged_batch = DataProto.concat(all_batches) + logger.debug(f"Successfully merged {len(batch_samples)} batches") + return merged_batch + except Exception as e: + logger.error(f"Failed to merge batch samples: {e}") + raise + + def _compute_sample_freshness_metrics(self, batch_samples: list[BatchSample]) -> dict: + """计算样本新鲜度指标""" + sample_ages = [self.current_param_version - sample.param_version for sample in batch_samples] + current_time = time.time() + sample_latencies = [current_time - sample.timestamp for sample in batch_samples] + + return { + "freshness/avg_sample_age": np.mean(sample_ages), + "freshness/max_sample_age": max(sample_ages), + "freshness/min_sample_age": min(sample_ages), + "freshness/avg_sample_latency": np.mean(sample_latencies), + "freshness/max_sample_latency": max(sample_latencies), + "freshness/stale_samples_ratio": sum(1 for age in sample_ages if age > 1) / len(sample_ages), + } def fit(self): - """主训练循环""" + """主训练循环 - 基于OneStepOffRayTrainer的成熟实现""" from omegaconf import OmegaConf from verl.utils.tracking import Tracking - logger = Tracking( + logger_tracker = Tracking( project_name=self.config.trainer.project_name, experiment_name=self.config.trainer.experiment_name, 
default_backend=self.config.trainer.logger, @@ -257,17 +349,17 @@ def fit(self): # 加载检查点 self._load_checkpoint() - # 验证 + # 初始验证 if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True): val_metrics = self._validate() if val_metrics: pprint(f"Initial validation metrics: {val_metrics}") - logger.log(data=val_metrics, step=self.global_steps) + logger_tracker.log(data=val_metrics, step=self.global_steps) if self.config.trainer.get("val_only", False): return # 进度条 - progress_bar = tqdm(total=self.total_training_steps, initial=self.global_steps, desc="Training Progress") + progress_bar = tqdm(total=self.total_training_steps, initial=self.global_steps, desc="Async Training") self.global_steps += 1 last_val_metrics = None @@ -278,6 +370,7 @@ def fit(self): logger.info("Starting fully async training loop...") while self.global_steps <= self.total_training_steps: + # 性能分析 do_profile = ( self.global_steps in self.config.trainer.profile_steps if self.config.trainer.profile_steps is not None @@ -286,7 +379,7 @@ def fit(self): if do_profile: self.actor_wg.start_profile() - if self.use_reference_policy: + if self.use_reference_policy and not self.ref_in_actor: self.ref_policy_wg.start_profile() if self.use_critic: self.critic_wg.start_profile() @@ -295,7 +388,7 @@ def fit(self): metrics = {} timing_raw = {} - # is_last_step = self.global_steps >= self.total_training_steps + is_last_step = self.global_steps >= self.total_training_steps with marked_timer("step", timing_raw): # 从队列获取样本 @@ -308,64 +401,102 @@ def fit(self): ) if batch_samples is None: - logger.warning("Timeout waiting for batch samples, continuing...") + logger.warning("Timeout waiting for batch samples, retrying...") + time.sleep(1.0) continue # 处理获取的样本 - batch = self._process_batch_samples(batch_samples) + with marked_timer("process_batch_samples", timing_raw, color="cyan"): + batch = self._process_batch_samples(batch_samples) - # 计算样本的新鲜度 - sample_ages = [self.current_param_version 
- sample.param_version for sample in batch_samples] - avg_sample_age = np.mean(sample_ages) - max_sample_age = max(sample_ages) + # 计算样本新鲜度指标 + freshness_metrics = self._compute_sample_freshness_metrics(batch_samples) + metrics.update(freshness_metrics) - logger.info( - f"Processing batch with {len(batch_samples)} samples, " - f"avg_age={avg_sample_age:.1f}, max_age={max_sample_age}" - ) + logger.info( + f"Processing batch: {len(batch_samples)} samples, " + f"avg_age={freshness_metrics['freshness/avg_sample_age']:.1f}, " + f"max_age={freshness_metrics['freshness/max_sample_age']}" + ) - # 添加响应掩码 + # 添加响应掩码 - 参考OneStepOffRayTrainer batch.batch["response_mask"] = compute_response_mask(batch) - # 计算奖励 - with marked_timer("compute_reward", timing_raw, color="yellow"): - if self.reward_fn is not None: - batch, reward_extra_infos_dict = compute_reward( - batch, reward_fn=self.reward_fn, tokenizer=self.tokenizer - ) - elif self.use_rm: - batch, reward_extra_infos_dict = compute_reward_async( - batch, rm_wg=self.rm_wg, tokenizer=self.tokenizer - ) + # 平衡batch + if self.config.trainer.balance_batch: + self._balance_batch(batch, metrics=metrics) + + # 计算全局有效token数量 + batch.meta_info["global_token_num"] = torch.sum(batch.batch["attention_mask"], dim=-1).tolist() + + # 计算奖励 - 参考OneStepOffRayTrainer的实现 + with marked_timer("reward", timing_raw, color="yellow"): + if self.use_rm: + reward_tensor = self.rm_wg.compute_rm_score(batch) + batch = batch.union(reward_tensor) + + if self.config.reward_model.get("launch_reward_fn_async", False): + future_reward = compute_reward_async.remote(batch, self.config, self.tokenizer) else: - raise ValueError("No reward function or reward model provided") + reward_tensor, reward_extra_infos_dict = compute_reward(batch, self.reward_fn) + + # 计算旧的log probabilities - 参考OneStepOffRayTrainer + with marked_timer("old_log_prob", timing_raw, color="blue"): + old_log_prob = self.actor_wg.compute_log_prob(batch) + entropys = old_log_prob.batch["entropys"] + 
response_masks = batch.batch["response_mask"] + loss_agg_mode = self.config.actor_rollout_ref.actor.loss_agg_mode + entropy_agg = agg_loss(loss_mat=entropys, loss_mask=response_masks, loss_agg_mode=loss_agg_mode) + old_log_prob_metrics = {"actor/entropy": entropy_agg.detach().item()} + metrics.update(old_log_prob_metrics) + old_log_prob.batch.pop("entropys") + batch = batch.union(old_log_prob) # 计算reference log probabilities if self.use_reference_policy: - with marked_timer("compute_ref_log_prob", timing_raw, color="green"): - if self.ref_in_actor: - ref_log_prob_output = self.actor_wg.compute_ref_log_prob(batch) + with marked_timer("ref", timing_raw, color="olive"): + if not self.ref_in_actor: + ref_log_prob = self.ref_policy_wg.compute_ref_log_prob(batch) else: - ref_log_prob_output = self.ref_policy_wg.compute_log_prob(batch) - batch = batch.union(ref_log_prob_output) - - # 计算actor log probabilities - with marked_timer("compute_log_prob", timing_raw, color="cyan"): - log_prob_output = self.actor_wg.compute_log_prob(batch) - batch = batch.union(log_prob_output) - - # 应用KL惩罚 - if self.use_reference_policy: - batch = apply_kl_penalty(batch, self.config.algorithm) + ref_log_prob = self.actor_wg.compute_ref_log_prob(batch) + batch = batch.union(ref_log_prob) - # 计算优势 + # 计算values if self.use_critic: - with marked_timer("compute_values", timing_raw, color="magenta"): - values_output = self.critic_wg.compute_values(batch) - batch = batch.union(values_output) - - with marked_timer("compute_advantage", timing_raw, color="orange"): - batch = compute_advantage(batch, self.config.algorithm) + with marked_timer("values", timing_raw, color="cyan"): + values = self.critic_wg.compute_values(batch) + batch = batch.union(values) + + # 处理奖励和优势计算 + with marked_timer("adv", timing_raw, color="brown"): + if self.config.reward_model.get("launch_reward_fn_async", False): + reward_tensor, reward_extra_infos_dict = ray.get(future_reward) + batch.batch["token_level_scores"] = 
reward_tensor + + if reward_extra_infos_dict: + batch.non_tensor_batch.update({k: np.array(v) for k, v in reward_extra_infos_dict.items()}) + + # 应用KL惩罚 + if self.config.algorithm.use_kl_in_reward: + batch, kl_metrics = apply_kl_penalty( + batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.config.algorithm.kl_penalty + ) + metrics.update(kl_metrics) + else: + batch.batch["token_level_rewards"] = batch.batch["token_level_scores"] + + # 计算优势 + norm_adv_by_std_in_grpo = self.config.algorithm.get("norm_adv_by_std_in_grpo", True) + + batch = compute_advantage( + batch, + adv_estimator=self.config.algorithm.adv_estimator, + gamma=self.config.algorithm.gamma, + lam=self.config.algorithm.lam, + num_repeat=self.config.actor_rollout_ref.rollout.n, + norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo, + config=self.config.algorithm, + ) # 更新critic if self.use_critic: @@ -382,9 +513,9 @@ def fit(self): actor_output_metrics = reduce_metrics(actor_output.meta_info["metrics"]) metrics.update(actor_output_metrics) - # 更新参数版本并同步到Rollouter + # 同步参数到Rollouter with marked_timer("sync_params", timing_raw, color="purple"): - self._update_param_version_and_sync() + self._sync_parameters_to_rollouter() # 记录rollout生成 rollout_data_dir = self.config.trainer.get("rollout_data_dir", None) @@ -404,39 +535,54 @@ def fit(self): # 验证 if ( self.val_reward_fn is not None - and self.config.trainer.val_freq is not None - and self.global_steps % self.config.trainer.val_freq == 0 + and self.config.trainer.test_freq > 0 + and (is_last_step or self.global_steps % self.config.trainer.test_freq == 0) ): - with marked_timer("validation", timing_raw, color="brown"): + with marked_timer("testing", timing_raw, color="green"): val_metrics = self._validate() - if val_metrics: - pprint(f"Validation metrics at step {self.global_steps}: {val_metrics}") + if is_last_step: last_val_metrics = val_metrics + print(last_val_metrics) + if val_metrics: + metrics.update(val_metrics) - # 计算性能指标 - timing_metrics = 
compute_timing_metrics(timing_raw) - throughput_metrics = compute_throughout_metrics(timing_raw, len(batch)) - data_metrics = compute_data_metrics(batch, self.tokenizer) + # 保存检查点 + if self.config.trainer.save_freq > 0 and ( + is_last_step or self.global_steps % self.config.trainer.save_freq == 0 + ): + with marked_timer("save_checkpoint", timing_raw, color="green"): + self._save_checkpoint() - # 添加样本新鲜度指标 - freshness_metrics = { - "avg_sample_age": avg_sample_age, - "max_sample_age": max_sample_age, - "processed_samples": self.processed_samples, - "param_version": self.current_param_version, - } + # 收集指标 - 参考OneStepOffRayTrainer的指标收集 + metrics.update( + { + "training/global_step": self.global_steps, + "training/param_version": self.current_param_version, + "training/param_sync_count": self.param_sync_count, + } + ) - metrics.update(timing_metrics) - metrics.update(throughput_metrics) - metrics.update(data_metrics) - metrics.update(freshness_metrics) + # 数据和性能指标 + metrics.update(compute_data_metrics(batch=batch, use_critic=self.use_critic)) + metrics.update(compute_timing_metrics(batch=batch, timing_raw=timing_raw)) - if last_val_metrics is not None: - metrics.update(last_val_metrics) - last_val_metrics = None + n_gpus = self.resource_pool_manager.get_n_gpus() + metrics.update(compute_throughout_metrics(batch=batch, timing_raw=timing_raw, n_gpus=n_gpus)) + + # 队列状态指标 + queue_size = self.message_queue_client.get_queue_size() + queue_stats = self.message_queue_client.get_statistics() + metrics.update( + { + "queue/size": queue_size, + "queue/total_produced": queue_stats["total_produced"], + "queue/total_consumed": queue_stats["total_consumed"], + "queue/dropped_samples": queue_stats["dropped_samples"], + } + ) # 记录日志 - logger.log(data=metrics, step=self.global_steps) + logger_tracker.log(data=metrics, step=self.global_steps) # 更新进度条 progress_bar.update(1) @@ -444,27 +590,27 @@ def fit(self): { "reward": f"{metrics.get('reward/mean', 0):.3f}", "kl": 
f"{metrics.get('actor/approx_kl', 0):.3f}", - "queue_size": self.message_queue_client.get_queue_size(), - "param_version": self.current_param_version, + "queue_size": queue_size, + "param_ver": self.current_param_version, + "avg_age": f"{metrics.get('freshness/avg_sample_age', 0):.1f}", } ) - # 保存检查点 - if self.config.trainer.save_freq is not None and self.global_steps % self.config.trainer.save_freq == 0: - self._save_checkpoint() - if do_profile: - self.actor_wg.end_profile() - if self.use_reference_policy: - self.ref_policy_wg.end_profile() + self.actor_wg.stop_profile() + if self.use_reference_policy and not self.ref_in_actor: + self.ref_policy_wg.stop_profile() if self.use_critic: - self.critic_wg.end_profile() + self.critic_wg.stop_profile() if self.use_rm: - self.rm_wg.end_profile() + self.rm_wg.stop_profile() self.global_steps += 1 self.processed_samples += len(batch_samples) + if is_last_step: + break + progress_bar.close() logger.info(f"Training completed after {self.global_steps} steps") @@ -473,17 +619,22 @@ def fit(self): val_metrics = self._validate() if val_metrics: pprint(f"Final validation metrics: {val_metrics}") - logger.log(data=val_metrics, step=self.global_steps) + logger_tracker.log(data=val_metrics, step=self.global_steps) # 最终检查点保存 self._save_checkpoint() def get_statistics(self) -> dict: """获取训练统计信息""" + queue_stats = self.message_queue_client.get_statistics() if self.message_queue_client else {} return { "global_steps": self.global_steps, "processed_samples": self.processed_samples, "stale_samples_processed": self.stale_samples_processed, "current_param_version": self.current_param_version, - "queue_size": self.message_queue_client.get_queue_size() if self.message_queue_client else 0, + "param_sync_count": self.param_sync_count, + "queue_size": queue_stats.get("queue_size", 0), + "queue_total_produced": queue_stats.get("total_produced", 0), + "queue_total_consumed": queue_stats.get("total_consumed", 0), + "queue_dropped_samples": 
queue_stats.get("dropped_samples", 0), } diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py index 272f890cbbc..023475ef777 100644 --- a/recipe/fully_async_policy/param_sync.py +++ b/recipe/fully_async_policy/param_sync.py @@ -13,6 +13,7 @@ # limitations under the License. import logging +import time import ray from ray.util.collective import collective @@ -23,64 +24,146 @@ class ParameterSynchronizer: """ 参数同步器,负责在actor和rollout之间同步模型参数 + 改进版本,具有更好的错误处理和重试机制 """ def __init__(self, config): self.config = config self.weights_info = None self.sync_group_initialized = False + self.sync_group_name = "actor_rollout" - def initialize_sync_group(self, actor_workers: list, rollout_workers: list): + # 同步配置 + self.max_sync_retries = config.async_training.get("max_sync_retries", 3) + self.sync_timeout = config.async_training.get("sync_timeout", 30.0) + self.retry_delay = config.async_training.get("sync_retry_delay", 1.0) + + # 统计信息 + self.sync_count = 0 + self.sync_failures = 0 + self.last_sync_time = 0 + + def initialize_sync_group(self, actor_workers: list, rollout_workers: list) -> bool: """ 初始化参数同步组 Args: actor_workers: actor worker列表 rollout_workers: rollout worker列表 + + Returns: + bool: 是否成功初始化 """ logger.info("Initializing parameter synchronization group...") try: + # 验证workers + if not actor_workers: + raise ValueError("No actor workers provided") + if not rollout_workers: + raise ValueError("No rollout workers provided") + # 获取actor的权重信息 - if actor_workers: - self.weights_info = ray.get(actor_workers[0].get_actor_weights_info.remote())[0] + logger.debug("Getting actor weights info...") + weights_info_future = actor_workers[0].get_actor_weights_info.remote() + self.weights_info = ray.get(weights_info_future, timeout=10.0)[0] - # 设置rollout的权重信息 - for rollout_worker in rollout_workers: - ray.get(rollout_worker.set_actor_weights_info.remote(self.weights_info)) + if not self.weights_info: + raise ValueError("Failed to get actor 
weights info") + + # 设置rollout的权重信息 + logger.debug("Setting rollout weights info...") + set_weights_futures = [] + for rollout_worker in rollout_workers: + future = rollout_worker.set_actor_weights_info.remote(self.weights_info) + set_weights_futures.append(future) + + ray.get(set_weights_futures, timeout=10.0) # 创建actor-rollout通信组 + logger.debug("Creating collective communication group...") all_workers = actor_workers + rollout_workers + + # 清理可能存在的旧组 + try: + collective.destroy_collective_group(self.sync_group_name) + except Exception: + pass # 忽略清理错误 + collective.create_collective_group( all_workers, len(all_workers), list(range(0, len(all_workers))), backend="nccl", - group_name="actor_rollout", + group_name=self.sync_group_name, ) self.sync_group_initialized = True logger.info("Parameter synchronization group initialized successfully") + return True except Exception as e: logger.error(f"Failed to initialize sync group: {e}") - raise + self.sync_group_initialized = False + return False - def sync_weights(self, actor_workers: list, rollout_workers: list): + def sync_weights(self, actor_workers: list, rollout_workers: list) -> bool: """ - 同步权重从actor到rollout + 同步权重从actor到rollout - 改进版本,具有重试和错误处理 Args: actor_workers: actor worker列表 rollout_workers: rollout worker列表 + + Returns: + bool: 是否同步成功 """ if not self.sync_group_initialized: - raise RuntimeError("Sync group not initialized. Call initialize_sync_group() first.") + logger.error("Sync group not initialized. 
Call initialize_sync_group() first.") + return False + + logger.debug("Starting weight synchronization...") + start_time = time.time() + + for attempt in range(self.max_sync_retries): + try: + # 执行同步 + success = self._execute_sync(actor_workers, rollout_workers) + + if success: + self.sync_count += 1 + self.last_sync_time = time.time() + sync_duration = self.last_sync_time - start_time + logger.debug(f"Weight synchronization completed in {sync_duration:.2f}s") + return True + else: + logger.warning(f"Sync attempt {attempt + 1} failed") + + except Exception as e: + logger.warning(f"Sync attempt {attempt + 1} failed with error: {e}") + + # 如果不是最后一次尝试,等待后重试 + if attempt < self.max_sync_retries - 1: + logger.info(f"Retrying sync in {self.retry_delay}s...") + time.sleep(self.retry_delay) + + # 所有重试都失败 + self.sync_failures += 1 + logger.error(f"All sync attempts failed. Total failures: {self.sync_failures}") + return False + + def _execute_sync(self, actor_workers: list, rollout_workers: list) -> bool: + """ + 执行实际的同步操作 - logger.debug("Synchronizing weights from actor to rollout...") + Args: + actor_workers: actor worker列表 + rollout_workers: rollout worker列表 + Returns: + bool: 是否同步成功 + """ try: - # 同步权重 sync_futures = [] # Actor端同步 @@ -93,20 +176,39 @@ def sync_weights(self, actor_workers: list, rollout_workers: list): future = rollout_worker.sync_rollout_weights.remote() sync_futures.append(future) - # 等待所有同步完成 - ray.get(sync_futures) - - logger.debug("Weight synchronization completed") + # 等待所有同步完成,带超时 + ray.get(sync_futures, timeout=self.sync_timeout) + return True except Exception as e: - logger.error(f"Failed to sync weights: {e}") - raise + logger.error(f"Sync execution failed: {e}") + return False + + def cleanup(self): + """清理同步组""" + if self.sync_group_initialized: + try: + collective.destroy_collective_group(self.sync_group_name) + logger.info("Sync group cleaned up") + except Exception as e: + logger.warning(f"Error cleaning up sync group: {e}") + finally: + 
self.sync_group_initialized = False + + def get_statistics(self) -> dict: + """获取同步统计信息""" + return { + "sync_count": self.sync_count, + "sync_failures": self.sync_failures, + "last_sync_time": self.last_sync_time, + "sync_group_initialized": self.sync_group_initialized, + } @ray.remote class ParameterSyncManager: """ - Ray Actor形式的参数同步管理器 + Ray Actor形式的参数同步管理器 - 改进版本 """ def __init__(self, config): @@ -114,28 +216,69 @@ def __init__(self, config): self.synchronizer = ParameterSynchronizer(config) self.actor_workers = [] self.rollout_workers = [] + self.is_ready = False - def register_workers(self, actor_workers: list, rollout_workers: list): - """注册worker""" - self.actor_workers = actor_workers - self.rollout_workers = rollout_workers + def register_workers(self, actor_workers: list, rollout_workers: list) -> bool: + """ + 注册worker - # 初始化同步组 - self.synchronizer.initialize_sync_group(actor_workers, rollout_workers) + Args: + actor_workers: actor worker列表 + rollout_workers: rollout worker列表 + + Returns: + bool: 是否成功注册 + """ + try: + self.actor_workers = actor_workers + self.rollout_workers = rollout_workers + + # 初始化同步组 + success = self.synchronizer.initialize_sync_group(actor_workers, rollout_workers) + self.is_ready = success + + if success: + logger.info("ParameterSyncManager ready") + else: + logger.error("ParameterSyncManager initialization failed") + + return success + except Exception as e: + logger.error(f"Failed to register workers: {e}") + return False + + def sync_parameters(self) -> bool: + """ + 执行参数同步 - def sync_parameters(self): - """执行参数同步""" - self.synchronizer.sync_weights(self.actor_workers, self.rollout_workers) - return True + Returns: + bool: 是否同步成功 + """ + if not self.is_ready: + logger.error("SyncManager not ready. 
Call register_workers() first.") + return False + + return self.synchronizer.sync_weights(self.actor_workers, self.rollout_workers) def get_weights_info(self): """获取权重信息""" return self.synchronizer.weights_info + def get_statistics(self) -> dict: + """获取统计信息""" + stats = self.synchronizer.get_statistics() + stats["is_ready"] = self.is_ready + return stats + + def cleanup(self): + """清理资源""" + self.synchronizer.cleanup() + self.is_ready = False + class AsyncParameterSynchronizer: """ - 异步参数同步器,用于完全异步训练工作流 + 异步参数同步器,用于完全异步训练工作流 - 改进版本 """ def __init__(self, config, actor_wg, rollouter_actor): @@ -150,26 +293,100 @@ def __init__(self, config, actor_wg, rollouter_actor): self.rollouter_actor = rollouter_actor self.current_version = 0 - def sync_to_rollouter(self, new_version: int): - """ - 将actor参数同步到rollouter + # 同步配置 + self.sync_timeout = config.async_training.get("sync_timeout", 30.0) + self.max_sync_retries = config.async_training.get("max_sync_retries", 3) + self.retry_delay = config.async_training.get("sync_retry_delay", 1.0) - Args: - new_version: 新的参数版本号 - """ - logger.info(f"Syncing parameters to rollouter, version: {new_version}") + # 统计信息 + self.sync_count = 0 + self.sync_failures = 0 + self.last_sync_time = 0 + + # 初始化同步组 + self._init_sync_group() + def _init_sync_group(self): + """初始化同步组""" try: - # 通知rollouter更新参数 - ray.get(self.rollouter_actor.update_rollout_weights.remote(new_version)) + # 获取actor权重信息 + weights_info = self.actor_wg.get_actor_weights_info()[0] + + # 通知rollouter设置权重信息 + ray.get(self.rollouter_actor.set_weights_info.remote(weights_info), timeout=10.0) + + # 创建同步通信组 + actor_workers = self.actor_wg.workers + rollout_workers = ray.get(self.rollouter_actor.get_rollout_workers.remote(), timeout=10.0) + + all_workers = actor_workers + rollout_workers + collective.create_collective_group( + all_workers, + len(all_workers), + list(range(0, len(all_workers))), + backend="nccl", + group_name="async_actor_rollout", + ) - self.current_version = 
new_version - logger.info(f"Parameter sync to rollouter completed, version: {new_version}") + logger.info("Async parameter synchronizer initialized") except Exception as e: - logger.error(f"Failed to sync parameters to rollouter: {e}") - raise + logger.warning(f"Failed to initialize async sync group: {e}") + + def sync_to_rollouter(self, new_version: int) -> bool: + """ + 将actor参数同步到rollouter - 改进版本,具有重试机制 + + Args: + new_version: 新的参数版本号 + + Returns: + bool: 是否同步成功 + """ + logger.info(f"Syncing parameters to rollouter, version: {new_version}") + start_time = time.time() + + for attempt in range(self.max_sync_retries): + try: + # 首先同步actor到rollout worker group + self.actor_wg.sync_rollout_weights() + + # 然后通知rollouter更新参数版本 + sync_future = self.rollouter_actor.update_rollout_weights.remote(new_version) + sync_result = ray.get(sync_future, timeout=self.sync_timeout) + + if sync_result: + self.current_version = new_version + self.sync_count += 1 + self.last_sync_time = time.time() + sync_duration = self.last_sync_time - start_time + logger.info(f"Parameter sync completed in {sync_duration:.2f}s, version: {new_version}") + return True + else: + logger.warning(f"Rollouter rejected sync for version {new_version}") + + except Exception as e: + logger.warning(f"Sync attempt {attempt + 1} failed: {e}") + + # 如果不是最后一次尝试,等待后重试 + if attempt < self.max_sync_retries - 1: + logger.info(f"Retrying sync in {self.retry_delay}s...") + time.sleep(self.retry_delay) + + # 所有重试都失败 + self.sync_failures += 1 + logger.error(f"Failed to sync parameters to rollouter after {self.max_sync_retries} attempts") + return False def get_current_version(self) -> int: """获取当前参数版本""" return self.current_version + + def get_statistics(self) -> dict: + """获取统计信息""" + return { + "current_version": self.current_version, + "sync_count": self.sync_count, + "sync_failures": self.sync_failures, + "last_sync_time": self.last_sync_time, + } diff --git a/recipe/fully_async_policy/test_fully_async.py 
b/recipe/fully_async_policy/test_fully_async.py index eaa9313254a..6332a4dd4d8 100644 --- a/recipe/fully_async_policy/test_fully_async.py +++ b/recipe/fully_async_policy/test_fully_async.py @@ -101,7 +101,7 @@ class TestRollouterComponents(unittest.TestCase): def setUp(self): """设置测试环境""" - from .rollouter import RolloutController + from .fully_async_rollouter import RolloutController self.controller = RolloutController() From 289a4a5833cf5074d892792bb2df25f8f3aaa6c2 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Thu, 31 Jul 2025 16:06:31 +0800 Subject: [PATCH 012/182] message_queue --- .../fully_async_policy/README_fully_async.md | 6 +- .../config/fully_async_ppo_trainer.yaml | 2 +- recipe/fully_async_policy/fully_async_main.py | 2 +- .../fully_async_rollouter.py | 10 +- .../fully_async_policy/fully_async_trainer.py | 10 +- recipe/fully_async_policy/message_queue.py | 152 +++---- .../run_fully_async_example.sh | 4 +- recipe/fully_async_policy/test_fully_async.py | 4 +- recipe/fully_async_policy/test_mq.py | 343 ---------------- recipe/fully_async_policy/unittest/test_mq.py | 373 ++++++++++++++++++ 10 files changed, 454 insertions(+), 452 deletions(-) delete mode 100644 recipe/fully_async_policy/test_mq.py create mode 100644 recipe/fully_async_policy/unittest/test_mq.py diff --git a/recipe/fully_async_policy/README_fully_async.md b/recipe/fully_async_policy/README_fully_async.md index 4c1866788a5..1708be5ae34 100644 --- a/recipe/fully_async_policy/README_fully_async.md +++ b/recipe/fully_async_policy/README_fully_async.md @@ -96,7 +96,7 @@ ```yaml async_training: # 新鲜度控制 - freshness_threshold: 3 # 样本新鲜度阈值 + staleness_threshold: 3 # 样本新鲜度阈值 max_staleness_allowed: 5 # 最大允许的样本陈旧度 # 队列管理 @@ -144,7 +144,7 @@ python fully_async_main.py --config-path /path/to/config --config-name my_config ```python # 在配置文件中自定义异步训练参数 async_training: - freshness_threshold: 5 + staleness_threshold: 5 max_queue_size: 2000 generation_timeout: 60.0 ``` @@ -196,7 +196,7 @@ async_training: - 调整 
`batch_generation_interval` 2. **样本过期严重** - - 调整 `freshness_threshold` + - 调整 `staleness_threshold` - 检查参数同步频率 - 监控 `stale_samples_ratio` diff --git a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml index 19c4aa01339..d97484d88f4 100644 --- a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml +++ b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml @@ -10,7 +10,7 @@ defaults: async_training: # 新鲜度控制 (Freshness Control) - freshness_threshold: 3 # 样本新鲜度阈值 + staleness_threshold: 3 # 样本新鲜度阈值 max_staleness_allowed: 5 # 最大允许的样本陈旧度 # 队列管理 (Queue Management) diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py index e57e3e119b7..3773d90d8d7 100644 --- a/recipe/fully_async_policy/fully_async_main.py +++ b/recipe/fully_async_policy/fully_async_main.py @@ -510,7 +510,7 @@ def main(config): # 设置默认异步训练配置 config.async_training = OmegaConf.create( { - "freshness_threshold": 3, + "staleness_threshold": 3, "max_staleness_allowed": 5, "max_queue_size": 1000, "min_batch_count": 1, diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 3ece39d0f10..06380803aee 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -154,7 +154,7 @@ def __init__( # 新鲜度控制 - 改进的配置管理 async_config = config.async_training - self.freshness_threshold = async_config.get("freshness_threshold", 3) + self.staleness_threshold = async_config.get("staleness_threshold", 3) self.max_staleness_allowed = async_config.get("max_staleness_allowed", 5) self.generation_timeout = async_config.get("generation_timeout", 30.0) self.batch_generation_interval = async_config.get("batch_generation_interval", 0.1) @@ -190,7 +190,7 @@ def _validate_config(self): required_configs = [ "data.train_batch_size", "actor_rollout_ref.rollout.n", - 
"async_training.freshness_threshold", + "async_training.staleness_threshold", ] for config_path in required_configs: @@ -428,7 +428,7 @@ def _should_pause_generation(self) -> bool: return True # 如果队列太满,也暂停生成 - max_queue_size = self.freshness_threshold * self.config.data.train_batch_size + max_queue_size = self.staleness_threshold * self.config.data.train_batch_size if queue_size >= max_queue_size: logger.debug(f"Should pause due to full queue: size={queue_size}, max={max_queue_size}") return True @@ -532,9 +532,9 @@ def _generation_loop(self): } # 放入队列 - success = self.message_queue_client.put_batch( + success = self.message_queue_client.put_samples( epoch=epoch, - batch=generated_batch, + sample=generated_batch, param_version=self.current_param_version, rollout_metadata=rollout_metadata, ) diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index e66bc895c9c..36687861ae8 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -23,7 +23,7 @@ from torch.utils.data import Dataset, Sampler from tqdm import tqdm -from recipe.fully_async_policy.message_queue import BatchSample, MessageQueueClient +from recipe.fully_async_policy.message_queue import QueueSample, MessageQueueClient from verl import DataProto from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup from verl.single_controller.ray.base import create_colocated_worker_cls @@ -298,7 +298,7 @@ def _sync_parameters_to_rollouter(self): self.current_param_version -= 1 # 回滚版本号 raise - def _process_batch_samples(self, batch_samples: list[BatchSample]) -> DataProto: + def _process_batch_samples(self, batch_samples: list[QueueSample]) -> DataProto: """处理从队列获取的batch样本 - 改进的批处理逻辑""" if not batch_samples: raise ValueError("Empty batch samples") @@ -316,7 +316,7 @@ def _process_batch_samples(self, batch_samples: list[BatchSample]) -> DataProto: logger.error(f"Failed to merge batch 
samples: {e}") raise - def _compute_sample_freshness_metrics(self, batch_samples: list[BatchSample]) -> dict: + def _compute_sample_freshness_metrics(self, batch_samples: list[QueueSample]) -> dict: """计算样本新鲜度指标""" sample_ages = [self.current_param_version - sample.param_version for sample in batch_samples] current_time = time.time() @@ -396,8 +396,8 @@ def fit(self): min_batch_count = self.config.async_training.get("min_batch_count", 1) batch_timeout = self.config.async_training.get("batch_timeout", 30.0) - batch_samples = self.message_queue_client.get_batch( - min_batch_count=min_batch_count, timeout=batch_timeout + batch_samples = self.message_queue_client.get_samples( + min_batch=min_batch_count, timeout=batch_timeout ) if batch_samples is None: diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py index 58996d4266e..5866dcfd4a9 100644 --- a/recipe/fully_async_policy/message_queue.py +++ b/recipe/fully_async_policy/message_queue.py @@ -18,21 +18,19 @@ import uuid from collections import deque from dataclasses import dataclass -from typing import Any, Optional +from typing import Any, Optional, List import ray -import zmq -from filelock import FileLock from omegaconf import DictConfig logger = logging.getLogger(__name__) @dataclass -class BatchSample: +class QueueSample: """单个batch样本,包含参数版本和新鲜度信息""" - batch_id: str + id: str epoch: int data: Any param_version: int @@ -40,11 +38,10 @@ class BatchSample: rollout_metadata: dict[str, Any] -@ray.remote(num_cpus=1) +@ray.remote(num_cpus=10, max_concurrency=10) class MessageQueue: """ 简化的Ray-based异步消息队列,用于Rollouter和Trainer之间的通信 - 去掉了ZeroMQ的复杂性,使用更可靠的Ray机制 """ def __init__(self, config: DictConfig, max_queue_size: int = 1000): @@ -56,27 +53,17 @@ def __init__(self, config: DictConfig, max_queue_size: int = 1000): # 安全地获取配置值 try: if hasattr(config, "async_training") and config.async_training is not None: - self.freshness_threshold = getattr(config.async_training, 
"freshness_threshold", 3) + self.staleness_threshold = getattr(config.async_training, "staleness_threshold", 3) else: - self.freshness_threshold = 3 + self.staleness_threshold = 3 except (AttributeError, RecursionError): - self.freshness_threshold = 3 - - # ZeroMQ setup - self.context = None - self.socket = None - self.address = None - try: - self._setup_zmq() - except Exception as e: - print(f"Warning: ZeroMQ setup failed: {e}. Queue will work without ZeroMQ.") + self.staleness_threshold = 3 # Threading for message handling self.running = True # 线程安全 self.lock = threading.RLock() - self.consumer_waiting = False self.consumer_condition = threading.Condition(self.lock) # 统计信息 @@ -86,35 +73,19 @@ def __init__(self, config: DictConfig, max_queue_size: int = 1000): logger.info( f"MessageQueue initialized with max_queue_size={max_queue_size}," - "freshness_threshold={self.freshness_threshold}" + "staleness_threshold={self.staleness_threshold}" ) - def _setup_zmq(self): - """设置ZeroMQ socket""" - with FileLock("/tmp/verl_message_queue.lock"): - # 初始化 ZeroMQ context - self.context = zmq.Context() - - # 使用TCP socket - import socket as sock - - with sock.socket() as s: - s.bind(("", 0)) - port = s.getsockname()[1] - - self.address = f"tcp://127.0.0.1:{port}" - self.socket = self.context.socket(zmq.PAIR) - self.socket.bind(self.address) - - def put_batch(self, epoch: int, batch: Any, param_version: int, rollout_metadata: dict[str, Any] = None) -> bool: + def put_samples(self, epoch: int, samples: List[Any], param_version: int, + rollout_metadata_list: List[dict[str, Any]] = None) -> bool: """ 放入一个batch样本到队列 Args: epoch: 当前epoch - batch: 样本数据 + samples: 样本数据 param_version: 参数版本号 - rollout_metadata: rollout相关的元数据 + rollout_metadata_list: rollout相关的元数据 Returns: bool: 是否成功放入队列 @@ -122,62 +93,67 @@ def put_batch(self, epoch: int, batch: Any, param_version: int, rollout_metadata with self.lock: # 检查新鲜度 staleness = self.current_param_version - param_version - if staleness >= 
self.freshness_threshold: + if staleness >= self.staleness_threshold: self.dropped_samples += 1 - logger.debug(f"Dropped stale sample: staleness={staleness}, threshold={self.freshness_threshold}") + logger.debug(f"Dropped stale sample: staleness={staleness}, threshold={self.staleness_threshold}") return False - sample = BatchSample( - batch_id=str(uuid.uuid4()), - epoch=epoch, - data=batch, - param_version=param_version, - timestamp=time.time(), - rollout_metadata=rollout_metadata or {}, - ) - - # 如果队列满了,移除最旧的样本 - if len(self.queue) >= self.max_queue_size: - removed = self.queue.popleft() - self.dropped_samples += 1 - logger.warning(f"Queue full, dropped sample {removed.batch_id}") + # 处理 rollout_metadatas 为 None 的情况 + if rollout_metadata_list is None: + rollout_metadata_list = [{}] * len(samples) + + if len(rollout_metadata_list) != len(samples): + logger.warning( + f"len(rollout_metadata_list):{len(rollout_metadata_list)} != len(samples:{len(samples)}") + return False - self.queue.append(sample) - self.total_produced += 1 + for sample, meta in zip(samples, rollout_metadata_list): + queue_sample = QueueSample( + id=str(uuid.uuid4()), + epoch=epoch, + data=sample, + param_version=param_version, + timestamp=time.time(), + rollout_metadata=meta or {}, + ) + + # 如果队列满了,移除最旧的样本,一般不会发生 + if len(self.queue) >= self.max_queue_size: + removed = self.queue.popleft() + self.dropped_samples += 1 + logger.warning(f"Queue full, dropped sample {removed.id}") + + self.queue.append(queue_sample) + self.total_produced += 1 # 通知等待的消费者 - if self.consumer_waiting: - self.consumer_condition.notify() + self.consumer_condition.notify() if self.total_produced % 100 == 0: logger.debug(f"MessageQueue stats: produced={self.total_produced}, queue_size={len(self.queue)}") return True - def get_batch(self, min_batch_count: int = 1, timeout: float = 30.0) -> Optional[list[BatchSample]]: + def get_samples(self, min_batch: int = 1) -> list[QueueSample]: """ - 从队列获取batch样本 + 
从队列获取batch样本,一直等待直到有足够样本 Args: - min_batch_count: 最小batch数量 - timeout: 超时时间(秒) + min_batch: sample数量满足min_batch,一次性获取 Returns: - Optional[List[BatchSample]]: 获取的样本列表,如果超时返回None + List[QueueSample]: 获取的样本列表 """ with self.lock: - start_time = time.time() + while len(self.queue) < min_batch and self.running: + self.consumer_condition.wait() - while len(self.queue) < min_batch_count: - if time.time() - start_time > timeout: - return None - - self.consumer_waiting = True - self.consumer_condition.wait(timeout=1.0) - self.consumer_waiting = False + # 如果队列已关闭且没有足够样本,返回空列表 + if not self.running and len(self.queue) < min_batch: + return [] # 获取指定数量的样本 - batch_count = min(min_batch_count, len(self.queue)) + batch_count = min(min_batch, len(self.queue)) samples = [] for _ in range(batch_count): if self.queue: @@ -207,7 +183,7 @@ def get_statistics(self) -> dict[str, Any]: "total_consumed": self.total_consumed, "dropped_samples": self.dropped_samples, "current_param_version": self.current_param_version, - "freshness_threshold": self.freshness_threshold, + "staleness_threshold": self.staleness_threshold, "max_queue_size": self.max_queue_size, } @@ -220,11 +196,11 @@ def clear_queue(self): def shutdown(self): """关闭消息队列""" - self.running = False - if self.socket: - self.socket.close() - if self.context: - self.context.term() + with self.lock: # 修正:需要加锁 + self.running = False + # 通知所有等待的线程,让它们能够退出 + self.consumer_condition.notify_all() + logger.info("MessageQueue shutdown") def get_memory_usage(self) -> dict: """获取内存使用统计""" @@ -254,10 +230,6 @@ def get_memory_usage(self) -> dict: "estimated_memory_mb": total_size / (1024 * 1024), } - def get_address(self) -> str: - """获取ZeroMQ地址""" - return self.address - class MessageQueueClient: """MessageQueue的客户端,用于与MessageQueue Actor通信""" @@ -265,13 +237,13 @@ class MessageQueueClient: def __init__(self, queue_actor: Any): self.queue_actor = queue_actor - def put_batch(self, epoch: int, batch: Any, param_version: int, rollout_metadata: 
dict[str, Any] = None) -> bool: + def put_batch(self, epoch: int, batch: List[Any], param_version: int, rollout_metadata_list: List[dict[str, Any]] = None) -> bool: """放入batch到队列""" - return ray.get(self.queue_actor.put_batch.remote(epoch, batch, param_version, rollout_metadata)) + return ray.get(self.queue_actor.put_samples.remote(epoch, batch, param_version, rollout_metadata_list)) - def get_batch(self, min_batch_count: int = 1, timeout: float = 30.0) -> Optional[list[BatchSample]]: - """从队列获取batch""" - return ray.get(self.queue_actor.get_batch.remote(min_batch_count, timeout)) + def get_batch(self, min_batch_count: int = 1) -> list[QueueSample]: + """从队列获取batch,一直等待直到有足够样本""" + return ray.get(self.queue_actor.get_samples.remote(min_batch_count)) def update_param_version(self, version: int): """更新参数版本""" diff --git a/recipe/fully_async_policy/run_fully_async_example.sh b/recipe/fully_async_policy/run_fully_async_example.sh index d58e4ecc771..180071318a1 100644 --- a/recipe/fully_async_policy/run_fully_async_example.sh +++ b/recipe/fully_async_policy/run_fully_async_example.sh @@ -54,7 +54,7 @@ max_prompt_length=1024 max_response_length=1024 # 异步训练参数 -freshness_threshold=3 +staleness_threshold=3 max_staleness_allowed=5 max_queue_size=1000 min_batch_count=1 @@ -120,7 +120,7 @@ python -m recipe.one_step_off_policy.fully_async_main \ critic.fsdp_config.param_offload=false \ \ # 异步训练配置 - async_training.freshness_threshold=$freshness_threshold \ + async_training.staleness_threshold=$staleness_threshold \ async_training.max_staleness_allowed=$max_staleness_allowed \ async_training.max_queue_size=$max_queue_size \ async_training.min_batch_count=$min_batch_count \ diff --git a/recipe/fully_async_policy/test_fully_async.py b/recipe/fully_async_policy/test_fully_async.py index 6332a4dd4d8..c138debcaa0 100644 --- a/recipe/fully_async_policy/test_fully_async.py +++ b/recipe/fully_async_policy/test_fully_async.py @@ -40,7 +40,7 @@ def setUp(self): config = OmegaConf.create( { 
"async_training": { - "freshness_threshold": 3, + "staleness_threshold": 3, "max_staleness_allowed": 5, } } @@ -147,7 +147,7 @@ def test_integration(): config = OmegaConf.create( { "async_training": { - "freshness_threshold": 3, + "staleness_threshold": 3, "max_staleness_allowed": 5, } } diff --git a/recipe/fully_async_policy/test_mq.py b/recipe/fully_async_policy/test_mq.py deleted file mode 100644 index 3659911319e..00000000000 --- a/recipe/fully_async_policy/test_mq.py +++ /dev/null @@ -1,343 +0,0 @@ -# Copyright 2025 Meituan Ltd. and/or its affiliates -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import threading -import time -from unittest.mock import Mock - -import pytest -import ray -from message_queue import BatchSample, MessageQueue, MessageQueueClient -from omegaconf import DictConfig - - -@pytest.fixture -def mock_data_proto(): - """Mock DataProto对象""" - return Mock() - - -@pytest.fixture -def basic_config(): - """基础配置""" - return DictConfig({"async_training": {"freshness_threshold": 3}}) - - -@pytest.fixture -def queue_config(): - """队列配置""" - return DictConfig({"async_training": {"freshness_threshold": 2}}) - - -class TestBatchSample: - """测试BatchSample数据类""" - - def test_batch_sample_creation(self, mock_data_proto): - """测试BatchSample创建""" - sample = BatchSample( - batch_id="test-123", - epoch=1, - data=mock_data_proto, - param_version=5, - timestamp=1234567890.0, - rollout_metadata={"key": "value"}, - ) - - assert sample.batch_id == "test-123" - assert sample.epoch == 1 - assert sample.data == mock_data_proto - assert sample.param_version == 5 - assert sample.timestamp == 1234567890.0 - assert sample.rollout_metadata == {"key": "value"} - - -class TestMessageQueue: - """测试MessageQueue类(需要在非Ray环境下测试内部逻辑)""" - - def test_message_queue_init(self, basic_config): - """测试MessageQueue初始化""" - # 直接创建MessageQueue实例(不使用Ray装饰器) - queue = MessageQueue.__ray_actor_class__(basic_config, max_queue_size=100) - - # 确保ZeroMQ初始化成功 - assert queue.context is not None - assert queue.socket is not None - - # 基本属性检查 - assert queue.max_queue_size == 100 - assert queue.current_param_version == 0 - assert queue.freshness_threshold == 3 - assert len(queue.queue) == 0 - assert queue.total_produced == 0 - assert queue.total_consumed == 0 - assert queue.dropped_samples == 0 - - # 清理资源 - queue.shutdown() - - -@pytest.fixture -def ray_setup(): - """设置Ray环境""" - if not ray.is_initialized(): - ray.init(local_mode=True, ignore_reinit_error=True) - yield - ray.shutdown() - - -@pytest.fixture -def message_queue_actor(ray_setup, basic_config): - """创建MessageQueue actor""" - actor = 
MessageQueue.remote(basic_config, max_queue_size=10) - yield actor - ray.get(actor.shutdown.remote()) - - -class TestMessageQueueActor: - """测试MessageQueue Actor""" - - def test_put_batch_success(self, message_queue_actor, mock_data_proto): - """测试成功放入batch""" - result = ray.get( - message_queue_actor.put_batch.remote( - epoch=1, batch=mock_data_proto, param_version=1, rollout_metadata={"test": "data"} - ) - ) - - assert result is True - - # 检查队列大小 - queue_size = ray.get(message_queue_actor.get_queue_size.remote()) - assert queue_size == 1 - - # 检查统计信息 - stats = ray.get(message_queue_actor.get_statistics.remote()) - assert stats["total_produced"] == 1 - assert stats["queue_size"] == 1 - - def test_put_batch_staleness_check(self, message_queue_actor, mock_data_proto): - """测试新鲜度检查""" - # 更新参数版本为5 - ray.get(message_queue_actor.update_param_version.remote(5)) - - # 尝试放入版本过旧的batch(版本差异>=3会被拒绝) - result = ray.get( - message_queue_actor.put_batch.remote( - epoch=1, - batch=mock_data_proto, - param_version=2, # 5-2=3, 达到阈值 - rollout_metadata={}, - ) - ) - - assert result is False - - # 检查统计信息中的丢弃样本数 - stats = ray.get(message_queue_actor.get_statistics.remote()) - assert stats["dropped_samples"] == 1 - - def test_put_batch_queue_overflow(self, message_queue_actor, mock_data_proto): - """测试队列溢出处理""" - # 填满队列(最大容量10) - for i in range(12): # 超过最大容量 - ray.get( - message_queue_actor.put_batch.remote( - epoch=1, batch=mock_data_proto, param_version=1, rollout_metadata={} - ) - ) - - # 队列大小应该保持在最大值 - queue_size = ray.get(message_queue_actor.get_queue_size.remote()) - assert queue_size == 10 - - # 检查统计信息 - stats = ray.get(message_queue_actor.get_statistics.remote()) - assert stats["dropped_samples"] == 2 # 超出的2个被丢弃 - - def test_get_batch_success(self, message_queue_actor, mock_data_proto): - """测试成功获取batch""" - # 先放入一些batch - for i in range(3): - ray.get( - message_queue_actor.put_batch.remote( - epoch=i, batch=mock_data_proto, param_version=1, rollout_metadata={"index": i} - ) - 
) - - # 获取2个batch - samples = ray.get(message_queue_actor.get_batch.remote(min_batch_count=2, timeout=5.0)) - - assert samples is not None - assert len(samples) == 2 - assert all(isinstance(sample, BatchSample) for sample in samples) - - # 检查队列大小减少 - queue_size = ray.get(message_queue_actor.get_queue_size.remote()) - assert queue_size == 1 - - # 检查统计信息 - stats = ray.get(message_queue_actor.get_statistics.remote()) - assert stats["total_consumed"] == 2 - - def test_get_batch_timeout(self, message_queue_actor): - """测试获取batch超时""" - # 空队列情况下获取batch应该超时 - samples = ray.get(message_queue_actor.get_batch.remote(min_batch_count=1, timeout=1.0)) - assert samples is None - - def test_update_param_version(self, message_queue_actor): - """测试更新参数版本""" - ray.get(message_queue_actor.update_param_version.remote(10)) - - stats = ray.get(message_queue_actor.get_statistics.remote()) - assert stats["current_param_version"] == 10 - - def test_clear_queue(self, message_queue_actor, mock_data_proto): - """测试清空队列""" - # 先添加一些样本 - for i in range(3): - ray.get(message_queue_actor.put_batch.remote(epoch=i, batch=mock_data_proto, param_version=1)) - - # 清空队列 - ray.get(message_queue_actor.clear_queue.remote()) - - # 检查队列大小 - queue_size = ray.get(message_queue_actor.get_queue_size.remote()) - assert queue_size == 0 - - def test_get_statistics(self, message_queue_actor): - """测试获取统计信息""" - stats = ray.get(message_queue_actor.get_statistics.remote()) - - expected_keys = { - "queue_size", - "total_produced", - "total_consumed", - "dropped_samples", - "current_param_version", - "freshness_threshold", - "max_queue_size", - } - assert set(stats.keys()) == expected_keys - assert isinstance(stats["queue_size"], int) - assert isinstance(stats["total_produced"], int) - assert isinstance(stats["total_consumed"], int) - - -class TestMessageQueueClient: - """测试MessageQueueClient""" - - def test_client_put_batch(self, message_queue_actor, mock_data_proto): - """测试客户端放入batch""" - client = 
MessageQueueClient(message_queue_actor) - - result = client.put_batch(epoch=1, batch=mock_data_proto, param_version=1, rollout_metadata={"test": "client"}) - - assert result is True - assert client.get_queue_size() == 1 - - def test_client_get_batch(self, message_queue_actor, mock_data_proto): - """测试客户端获取batch""" - client = MessageQueueClient(message_queue_actor) - - # 先放入一个batch - client.put_batch(epoch=1, batch=mock_data_proto, param_version=1) - - # 获取batch - samples = client.get_batch(min_batch_count=1, timeout=5.0) - - assert samples is not None - assert len(samples) == 1 - assert isinstance(samples[0], BatchSample) - - def test_client_update_param_version(self, message_queue_actor): - """测试客户端更新参数版本""" - client = MessageQueueClient(message_queue_actor) - - client.update_param_version(15) - - stats = client.get_statistics() - assert stats["current_param_version"] == 15 - - def test_client_get_queue_size(self, message_queue_actor, mock_data_proto): - """测试客户端获取队列大小""" - client = MessageQueueClient(message_queue_actor) - - assert client.get_queue_size() == 0 - - client.put_batch(epoch=1, batch=mock_data_proto, param_version=1) - assert client.get_queue_size() == 1 - - def test_client_clear_queue(self, message_queue_actor, mock_data_proto): - """测试客户端清空队列""" - client = MessageQueueClient(message_queue_actor) - - # 添加样本 - client.put_batch(epoch=1, batch=mock_data_proto, param_version=1) - assert client.get_queue_size() == 1 - - # 清空队列 - client.clear_queue() - assert client.get_queue_size() == 0 - - def test_client_shutdown(self, message_queue_actor): - """测试客户端关闭""" - client = MessageQueueClient(message_queue_actor) - - # 关闭不应该抛出异常 - client.shutdown() - - -class TestConcurrency: - """测试并发场景""" - - def test_concurrent_put_get(self, message_queue_actor, mock_data_proto): - """测试并发放入和获取""" - client = MessageQueueClient(message_queue_actor) - results = [] - - def producer(): - for i in range(5): - result = client.put_batch(epoch=i, batch=mock_data_proto, 
param_version=1) - results.append(("put", result)) - time.sleep(0.1) - - def consumer(): - for _ in range(3): - samples = client.get_batch(min_batch_count=1, timeout=2.0) - results.append(("get", samples is not None)) - time.sleep(0.1) - - # 启动生产者和消费者线程 - producer_thread = threading.Thread(target=producer) - consumer_thread = threading.Thread(target=consumer) - - producer_thread.start() - time.sleep(0.05) # 让生产者先开始 - consumer_thread.start() - - producer_thread.join() - consumer_thread.join() - - # 检查结果 - put_results = [r[1] for r in results if r[0] == "put"] - get_results = [r[1] for r in results if r[0] == "get"] - - assert all(put_results) # 所有放入操作都应该成功 - assert all(get_results) # 所有获取操作都应该成功 - - -# 运行测试的示例配置 -if __name__ == "__main__": - pytest.main([__file__, "-v", "--tb=short"]) diff --git a/recipe/fully_async_policy/unittest/test_mq.py b/recipe/fully_async_policy/unittest/test_mq.py new file mode 100644 index 00000000000..dbc29c3e9ce --- /dev/null +++ b/recipe/fully_async_policy/unittest/test_mq.py @@ -0,0 +1,373 @@ +# Copyright 2025 Meituan Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +import threading +import time +from unittest.mock import Mock + +import pytest +import ray +from recipe.fully_async_policy.message_queue import QueueSample, MessageQueue, MessageQueueClient +from omegaconf import DictConfig + + +@pytest.fixture +def mock_data_proto(): + """Mock数据对象""" + return Mock() + + +@pytest.fixture +def basic_config(): + """基础配置""" + return DictConfig({"async_training": {"staleness_threshold": 3}}) + + +@pytest.fixture +def queue_config(): + """队列配置""" + return DictConfig({"async_training": {"staleness_threshold": 2}}) + + +@pytest.fixture +def ray_setup(): + """设置Ray环境""" + if not ray.is_initialized(): + ray.init(local_mode=True, ignore_reinit_error=True) + yield + ray.shutdown() + + +@pytest.fixture +def message_queue_client(ray_setup, basic_config): + """创建MessageQueue actor并返回其客户端""" + actor = MessageQueue.remote(basic_config, max_queue_size=10) + client = MessageQueueClient(actor) + yield client + client.shutdown() + + +class TestMessageQueue: + """测试MessageQueue(通过MessageQueueClient)""" + + def test_put_samples_success(self, message_queue_client, mock_data_proto): + """测试成功放入samples""" + samples = [mock_data_proto, mock_data_proto] + metadata_list = [{"test": "data1"}, {"test": "data2"}] + + result = message_queue_client.put_batch( + epoch=1, + batch=samples, + param_version=1, + rollout_metadata_list=metadata_list + ) + + assert result is True + + # 检查队列大小 + queue_size = message_queue_client.get_queue_size() + assert queue_size == 2 + + # 检查统计信息 + stats = message_queue_client.get_statistics() + assert stats["total_produced"] == 2 + assert stats["queue_size"] == 2 + + def test_put_samples_without_metadata(self, message_queue_client, mock_data_proto): + """测试不提供metadata时的处理""" + samples = [mock_data_proto, mock_data_proto] + + result = message_queue_client.put_batch( + epoch=1, + batch=samples, + param_version=1, + rollout_metadata_list=None + ) + + assert result is True + queue_size = message_queue_client.get_queue_size() + 
assert queue_size == 2 + + def test_put_samples_metadata_mismatch(self, message_queue_client, mock_data_proto): + """测试metadata长度不匹配的处理""" + samples = [mock_data_proto, mock_data_proto] + metadata_list = [{"test": "data1"}] # 长度不匹配 + + result = message_queue_client.put_batch( + epoch=1, + batch=samples, + param_version=1, + rollout_metadata_list=metadata_list + ) + + assert result is False # 应该失败 + queue_size = message_queue_client.get_queue_size() + assert queue_size == 0 + + def test_put_samples_staleness_check(self, message_queue_client, mock_data_proto): + """测试新鲜度检查""" + # 更新参数版本为5 + message_queue_client.update_param_version(5) + + # 尝试放入版本过旧的batch(版本差异>=3会被拒绝) + samples = [mock_data_proto] + result = message_queue_client.put_batch( + epoch=1, + batch=samples, + param_version=2, # 5-2=3, 达到阈值 + rollout_metadata_list=None + ) + + assert result is False + + # 检查统计信息中的丢弃样本数 + stats = message_queue_client.get_statistics() + assert stats["dropped_samples"] == 1 + + def test_put_samples_queue_overflow(self, message_queue_client, mock_data_proto): + """测试队列溢出处理""" + # 填满队列(最大容量10) + for i in range(6): # 每次放入2个,总共12个,超过最大容量10 + samples = [mock_data_proto, mock_data_proto] + message_queue_client.put_batch( + epoch=1, + batch=samples, + param_version=1, + rollout_metadata_list=None + ) + + # 队列大小应该保持在最大值 + queue_size = message_queue_client.get_queue_size() + assert queue_size == 10 + + # 检查统计信息 + stats = message_queue_client.get_statistics() + assert stats["dropped_samples"] == 2 # 超出的2个被丢弃 + + def test_get_samples_success(self, message_queue_client, mock_data_proto): + """测试成功获取samples""" + # 先放入一些samples + samples = [mock_data_proto, mock_data_proto, mock_data_proto] + metadata_list = [{"index": 0}, {"index": 1}, {"index": 2}] + message_queue_client.put_batch( + epoch=1, + batch=samples, + param_version=1, + rollout_metadata_list=metadata_list + ) + + # 获取2个samples + retrieved_samples = message_queue_client.get_batch(min_batch_count=2) + + assert retrieved_samples is 
not None + assert len(retrieved_samples) == 2 + assert all(isinstance(sample, QueueSample) for sample in retrieved_samples) + + # 检查队列大小减少 + queue_size = message_queue_client.get_queue_size() + assert queue_size == 1 + + # 检查统计信息 + stats = message_queue_client.get_statistics() + assert stats["total_consumed"] == 2 + + def test_get_samples_blocking_behavior(self, message_queue_client, mock_data_proto): + """测试阻塞行为""" + result = [] + + def get_samples(): + # 这会阻塞直到有足够样本 + samples = message_queue_client.get_batch(min_batch_count=2) + result.append(samples) + + def put_samples_later(): + time.sleep(0.5) # 延迟放入 + samples = [mock_data_proto, mock_data_proto] + message_queue_client.put_batch( + epoch=1, + batch=samples, + param_version=1, + rollout_metadata_list=None + ) + + # 启动消费者线程 + consumer_thread = threading.Thread(target=get_samples) + producer_thread = threading.Thread(target=put_samples_later) + + consumer_thread.start() + producer_thread.start() + + # 等待两个线程完成 + producer_thread.join(timeout=2) + consumer_thread.join(timeout=2) + + assert len(result) == 1 + assert len(result[0]) == 2 + + def test_update_param_version(self, message_queue_client): + """测试更新参数版本""" + message_queue_client.update_param_version(10) + stats = message_queue_client.get_statistics() + assert stats["current_param_version"] == 10 + + def test_clear_queue(self, message_queue_client, mock_data_proto): + """测试清空队列""" + # 先添加一些样本 + samples = [mock_data_proto, mock_data_proto, mock_data_proto] + message_queue_client.put_batch( + epoch=1, + batch=samples, + param_version=1, + rollout_metadata_list=None + ) + + # 清空队列 + message_queue_client.clear_queue() + + # 检查队列大小 + queue_size = message_queue_client.get_queue_size() + assert queue_size == 0 + + def test_get_queue_size(self, message_queue_client, mock_data_proto): + """测试获取队列大小""" + assert message_queue_client.get_queue_size() == 0 + + samples = [mock_data_proto] + message_queue_client.put_batch( + epoch=1, + batch=samples, + param_version=1, + 
rollout_metadata_list=None + ) + assert message_queue_client.get_queue_size() == 1 + + def test_get_statistics(self, message_queue_client): + """测试获取统计信息""" + stats = message_queue_client.get_statistics() + + expected_keys = { + "queue_size", + "total_produced", + "total_consumed", + "dropped_samples", + "current_param_version", + "staleness_threshold", + "max_queue_size", + } + assert set(stats.keys()) == expected_keys + assert isinstance(stats["queue_size"], int) + assert isinstance(stats["total_produced"], int) + assert isinstance(stats["total_consumed"], int) + + def test_get_memory_usage(self, message_queue_client, mock_data_proto): + """测试获取内存使用统计""" + # 添加一些样本 + samples = [mock_data_proto, mock_data_proto] + message_queue_client.put_batch( + epoch=1, + batch=samples, + param_version=1, + rollout_metadata_list=None + ) + + memory_stats = message_queue_client.get_memory_usage() + + expected_keys = {"queue_samples", "estimated_memory_bytes", "estimated_memory_mb"} + assert set(memory_stats.keys()) == expected_keys + assert memory_stats["queue_samples"] == 2 + assert memory_stats["estimated_memory_bytes"] > 0 + assert memory_stats["estimated_memory_mb"] > 0 + + def test_shutdown(self, ray_setup, basic_config): + """测试关闭功能""" + # 创建新的actor用于测试关闭 + actor = MessageQueue.remote(basic_config, max_queue_size=10) + client = MessageQueueClient(actor) + + # 关闭应该不抛出异常 + client.shutdown() + + +class TestConcurrency: + """测试并发场景""" + + def setup_method(self): + """每个测试方法前的设置""" + if not ray.is_initialized(): + ray.init(local_mode=True, ignore_reinit_error=True) + + def teardown_method(self): + """每个测试方法后的清理""" + if ray.is_initialized(): + ray.shutdown() + + def create_message_queue_client(self, config=None): + """创建MessageQueue client的辅助方法""" + if config is None: + config = DictConfig({"async_training": {"staleness_threshold": 3}}) + actor = MessageQueue.remote(config, max_queue_size=10) + return MessageQueueClient(actor) + + def test_concurrent_put_get(self, 
mock_data_proto): + """测试并发放入和获取""" + client = self.create_message_queue_client() + try: + results = [] + + def producer(): + for i in range(50): + samples = [mock_data_proto, mock_data_proto] + result = client.put_batch( + epoch=i, + batch=samples, + param_version=1, + rollout_metadata_list=None + ) + results.append(("put", result)) + time.sleep(0.1) + + def consumer(): + for _ in range(100): + try: + retrieved_samples = client.get_batch(min_batch_count=1) + results.append(("get", len(retrieved_samples) > 0)) + except Exception as e: + print(e) + results.append(("get", False)) + time.sleep(0.1) + + # 启动生产者和消费者线程 + producer_thread = threading.Thread(target=producer) + consumer_thread = threading.Thread(target=consumer) + + producer_thread.start() + time.sleep(0.05) + consumer_thread.start() + + producer_thread.join(timeout=5) + consumer_thread.join(timeout=5) + + # 检查结果 + put_results = [r[1] for r in results if r[0] == "put"] + get_results = [r[1] for r in results if r[0] == "get"] + + assert all(put_results) + assert all(get_results) + finally: + client.shutdown() + + +# 运行测试的示例配置 +if __name__ == "__main__": + pytest.main([__file__, "-v", "--tb=short"]) From a89991cb1f6ef6178a7327b2a63f00a78d461ff1 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Thu, 31 Jul 2025 19:26:25 +0800 Subject: [PATCH 013/182] train --- .../fully_async_policy/fully_async_trainer.py | 461 ++++++++---------- recipe/one_step_off_policy/ray_trainer.py | 11 +- 2 files changed, 221 insertions(+), 251 deletions(-) diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 36687861ae8..9122a97c8fa 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -14,6 +14,7 @@ import logging import time +import warnings from pprint import pprint import numpy as np @@ -59,24 +60,45 @@ class FullyAsyncTrainer(RayPPOTrainer): """ def __init__( - self, - config, - tokenizer, - 
role_worker_mapping: dict[Role, WorkerType], - resource_pool_manager: ResourcePoolManager, - ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, - processor=None, - reward_fn=None, - val_reward_fn=None, - train_dataset: Dataset | None = None, - val_dataset: Dataset | None = None, - collate_fn=None, - train_sampler: Sampler | None = None, - device_name="cuda", + self, + config, + tokenizer, + role_worker_mapping: dict[Role, WorkerType], + resource_pool_manager: ResourcePoolManager, + ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, + processor=None, + reward_fn=None, + val_reward_fn=None, + train_dataset: Optional[Dataset] = None, + val_dataset: Optional[Dataset] = None, + collate_fn=None, + train_sampler: Optional[Sampler] = None, + device_name=None, ): - self.config = config + """ + Initialize distributed PPO trainer with Ray backend. + Note that this trainer runs on the driver process on a single CPU/GPU node. + + Args: + config: Configuration object containing training parameters. + tokenizer: Tokenizer used for encoding and decoding text. + role_worker_mapping (dict[Role, WorkerType]): Mapping from roles to worker classes. + resource_pool_manager (ResourcePoolManager): Manager for Ray resource pools. + ray_worker_group_cls (RayWorkerGroup, optional): Class for Ray worker groups. Defaults to RayWorkerGroup. + processor: Optional data processor, used for multimodal data + reward_fn: Function for computing rewards during training. + val_reward_fn: Function for computing rewards during validation. + train_dataset (Optional[Dataset], optional): Training dataset. Defaults to None. + val_dataset (Optional[Dataset], optional): Validation dataset. Defaults to None. + collate_fn: Function to collate data samples into batches. + train_sampler (Optional[Sampler], optional): Sampler for the training dataset. Defaults to None. + device_name (str, optional): Device name for training (e.g., "cuda", "cpu"). Defaults to None. 
+ """ + + # Store the tokenizer for text processing self.tokenizer = tokenizer self.processor = processor + self.config = config self.reward_fn = reward_fn self.val_reward_fn = val_reward_fn @@ -85,87 +107,55 @@ def __init__( self.role_worker_mapping = role_worker_mapping self.resource_pool_manager = resource_pool_manager - self.ray_worker_group_cls = ray_worker_group_cls - self.device_name = device_name - self.validation_generations_logger = ValidationGenerationsLogger() - - # 数据相关 - self.train_dataset = train_dataset - self.val_dataset = val_dataset - self.collate_fn = collate_fn - self.train_sampler = train_sampler - - # 角色配置 - 参考OneStepOffRayTrainer的配置 self.use_reference_policy = Role.RefPolicy in role_worker_mapping self.use_rm = Role.RewardModel in role_worker_mapping + self.ray_worker_group_cls = ray_worker_group_cls + self.device_name = device_name if device_name else self.config.trainer.device + self.validation_generations_logger = ValidationGenerationsLogger( + project_name=self.config.trainer.project_name, + experiment_name=self.config.trainer.experiment_name, + ) + + # if ref_in_actor is True, the reference policy will be actor without lora applied self.ref_in_actor = config.actor_rollout_ref.model.get("lora_rank", 0) > 0 - # KL控制器 - if config.algorithm.use_kl_in_reward: - self.kl_ctrl_in_reward = core_algos.get_kl_controller(config.algorithm.kl_ctrl) + # define in-reward KL control + # kl loss control currently not suppoorted + if self.config.algorithm.use_kl_in_reward: + self.kl_ctrl_in_reward = core_algos.get_kl_controller(self.config.algorithm.kl_ctrl) - # 确定是否使用critic - 参考OneStepOffRayTrainer的逻辑 - if self.config.algorithm.adv_estimator == AdvantageEstimator.GAE: + if config.critic.enable is not None: + self.use_critic = bool(config.critic.enable) + elif self.config.algorithm.adv_estimator == AdvantageEstimator.GAE: self.use_critic = True - elif self.config.algorithm.adv_estimator in [ - AdvantageEstimator.GRPO, - AdvantageEstimator.GRPO_PASSK, - 
AdvantageEstimator.REINFORCE_PLUS_PLUS, - # AdvantageEstimator.REMAX, # TODO:REMAX advantage estimator is not yet supported in one_step_off_policy - AdvantageEstimator.RLOO, - AdvantageEstimator.OPO, - AdvantageEstimator.REINFORCE_PLUS_PLUS_BASELINE, - AdvantageEstimator.GPG, - ]: - self.use_critic = False else: - raise NotImplementedError(f"Unsupported advantage estimator: {self.config.algorithm.adv_estimator}") - - # Worker groups - self.actor_wg = None - self.critic_wg = None - self.ref_policy_wg = None - self.rm_wg = None - - # 训练状态 - self.global_steps = 0 - self.current_param_version = 0 - self.total_training_steps = config.trainer.total_training_steps - - # MessageQueue客户端 - self.message_queue_client = None - - # 与Rollouter的通信 - self.rollouter_actor = None - - # 统计信息 - self.processed_samples = 0 - self.stale_samples_processed = 0 - self.param_sync_count = 0 + warnings.warn( + "Disabled critic as algorithm.adv_estimator != gae. " + "If it is not intended, please set critic.enable=True", + stacklevel=2, + ) + self.use_critic = False self._validate_config() - - def _validate_config(self): - """验证配置""" - required_configs = ["trainer.total_training_steps", "algorithm.adv_estimator", "data.train_batch_size"] - - for config_path in required_configs: - if not OmegaConf.select(self.config, config_path): - raise ValueError(f"Missing required config: {config_path}") + self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler) def set_message_queue_client(self, message_queue_client: MessageQueueClient): """设置消息队列客户端""" self.message_queue_client = message_queue_client - def set_rollouter_actor(self, rollouter_actor): - """设置Rollouter Actor的引用""" - self.rollouter_actor = rollouter_actor + def _validate(self): + """执行验证 - 参考OneStepOffRayTrainer的验证逻辑""" + return None def init_workers(self): - """初始化训练workers - 参考OneStepOffRayTrainer的实现""" - logger.info("Initializing FullyAsyncTrainer workers...") + """Initialize distributed training workers using Ray 
backend. + Creates: + 1. Ray resource pools from configuration + 2. Worker groups for each role (actor, critic, etc.) + """ self.resource_pool_manager.create_resource_pool() + self.resource_pool_to_cls = {pool: {} for pool in self.resource_pool_manager.resource_pool_dict.values()} # 创建actor worker @@ -244,36 +234,6 @@ def init_workers(self): logger.info("FullyAsyncTrainer workers initialized successfully") - def _load_checkpoint(self): - """加载检查点""" - # TODO: 实现检查点加载逻辑 - logger.info("Checkpoint loading not implemented yet") - - def _validate(self): - """执行验证 - 参考OneStepOffRayTrainer的验证逻辑""" - if self.val_reward_fn is None: - return None - - # TODO: 实现完整的验证逻辑 - logger.info("Running validation...") - val_metrics = {"val_reward": 0.0} # 简化的验证指标 - return val_metrics - - def _save_checkpoint(self): - """保存检查点""" - # TODO: 实现检查点保存逻辑 - logger.info("Checkpoint saving not implemented yet") - - def _dump_generations(self, inputs, outputs, scores, reward_extra_infos_dict, dump_path): - """保存生成结果""" - # TODO: 实现生成结果保存逻辑 - logger.debug(f"Dumping generations to {dump_path}") - - def _balance_batch(self, batch: DataProto, metrics: dict): - """平衡batch中的有效token数量 - 参考OneStepOffRayTrainer的实现""" - # TODO: 实现batch平衡逻辑 - pass - def _sync_parameters_to_rollouter(self): """同步参数到Rollouter - 改进的同步机制""" if self.rollouter_actor is None: @@ -332,12 +292,17 @@ def _compute_sample_freshness_metrics(self, batch_samples: list[QueueSample]) -> } def fit(self): - """主训练循环 - 基于OneStepOffRayTrainer的成熟实现""" + """ + The training loop of PPO. + The driver process only need to call the compute functions of the worker group through RPC + to construct the PPO dataflow. + The light-weight advantage computation is done on the driver process. 
+ """ from omegaconf import OmegaConf from verl.utils.tracking import Tracking - logger_tracker = Tracking( + logger = Tracking( project_name=self.config.trainer.project_name, experiment_name=self.config.trainer.experiment_name, default_backend=self.config.trainer.logger, @@ -346,101 +311,90 @@ def fit(self): self.global_steps = 0 - # 加载检查点 + # load checkpoint before doing anything self._load_checkpoint() - # 初始验证 + # perform validation before training + # currently, we only support validation using the reward_function. if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True): val_metrics = self._validate() - if val_metrics: - pprint(f"Initial validation metrics: {val_metrics}") - logger_tracker.log(data=val_metrics, step=self.global_steps) + assert val_metrics, f"{val_metrics=}" + pprint(f"Initial validation metrics: {val_metrics}") + logger.log(data=val_metrics, step=self.global_steps) if self.config.trainer.get("val_only", False): return - # 进度条 - progress_bar = tqdm(total=self.total_training_steps, initial=self.global_steps, desc="Async Training") + # add tqdm + progress_bar = tqdm(total=self.total_training_steps, initial=self.global_steps, desc="Training Progress") + # we start from step 1 self.global_steps += 1 last_val_metrics = None - if self.message_queue_client is None: - raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.") + # across epoch iterator + continuous_iterator = self._create_continuous_iterator() - logger.info("Starting fully async training loop...") + # Start the first asynchronous generation task. 
+ batch_data_future = self._async_gen_next_batch(continuous_iterator) + + while batch_data_future is not None: + metrics = {} + timing_raw = {} - while self.global_steps <= self.total_training_steps: - # 性能分析 do_profile = ( self.global_steps in self.config.trainer.profile_steps if self.config.trainer.profile_steps is not None else False ) + with marked_timer("start_profile", timing_raw): + self._start_profiling(do_profile) - if do_profile: - self.actor_wg.start_profile() - if self.use_reference_policy and not self.ref_in_actor: - self.ref_policy_wg.start_profile() - if self.use_critic: - self.critic_wg.start_profile() - if self.use_rm: - self.rm_wg.start_profile() - - metrics = {} - timing_raw = {} is_last_step = self.global_steps >= self.total_training_steps with marked_timer("step", timing_raw): - # 从队列获取样本 - with marked_timer("get_batch_from_queue", timing_raw, color="blue"): - min_batch_count = self.config.async_training.get("min_batch_count", 1) - batch_timeout = self.config.async_training.get("batch_timeout", 30.0) - - batch_samples = self.message_queue_client.get_samples( - min_batch=min_batch_count, timeout=batch_timeout - ) - - if batch_samples is None: - logger.warning("Timeout waiting for batch samples, retrying...") - time.sleep(1.0) - continue - - # 处理获取的样本 - with marked_timer("process_batch_samples", timing_raw, color="cyan"): - batch = self._process_batch_samples(batch_samples) - - # 计算样本新鲜度指标 - freshness_metrics = self._compute_sample_freshness_metrics(batch_samples) - metrics.update(freshness_metrics) - - logger.info( - f"Processing batch: {len(batch_samples)} samples, " - f"avg_age={freshness_metrics['freshness/avg_sample_age']:.1f}, " - f"max_age={freshness_metrics['freshness/max_sample_age']}" - ) - - # 添加响应掩码 - 参考OneStepOffRayTrainer - batch.batch["response_mask"] = compute_response_mask(batch) - - # 平衡batch + # wait for the previous batch + with marked_timer("wait_prev_gen", timing_raw, color="red"): + epoch, batch, gen_batch_output = 
batch_data_future.get() + timing_raw.update(gen_batch_output.meta_info["timing"]) + gen_batch_output.meta_info.pop("timing", None) + + # asys next generation (with syns weights from actor to rollout) + with marked_timer("sync_rollout_weights", timing_raw, color="purple"): + if not is_last_step: + batch_data_future = self._async_gen_next_batch(continuous_iterator) + + batch.non_tensor_batch["uid"] = np.array( + [str(uuid.uuid4()) for _ in range(len(batch.batch))], dtype=object + ) + # repeat to align with repeated responses in rollout + batch = batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True) + batch = batch.union(gen_batch_output) + + if "response_mask" not in batch.batch.keys(): + batch.batch["response_mask"] = compute_response_mask(batch) + # Balance the number of valid tokens across DP ranks. + # NOTE: This usually changes the order of data in the `batch`, + # which won't affect the advantage calculation (since it's based on uid), + # but might affect the loss calculation (due to the change of mini-batching). + # TODO: Decouple the DP balancing and mini-batching. 
if self.config.trainer.balance_batch: self._balance_batch(batch, metrics=metrics) - # 计算全局有效token数量 + # compute global_valid tokens batch.meta_info["global_token_num"] = torch.sum(batch.batch["attention_mask"], dim=-1).tolist() - # 计算奖励 - 参考OneStepOffRayTrainer的实现 with marked_timer("reward", timing_raw, color="yellow"): + # compute reward model score if self.use_rm: reward_tensor = self.rm_wg.compute_rm_score(batch) batch = batch.union(reward_tensor) - if self.config.reward_model.get("launch_reward_fn_async", False): + if self.config.reward_model.launch_reward_fn_async: future_reward = compute_reward_async.remote(batch, self.config, self.tokenizer) else: reward_tensor, reward_extra_infos_dict = compute_reward(batch, self.reward_fn) - # 计算旧的log probabilities - 参考OneStepOffRayTrainer + # recompute old_log_probs with marked_timer("old_log_prob", timing_raw, color="blue"): old_log_prob = self.actor_wg.compute_log_prob(batch) entropys = old_log_prob.batch["entropys"] @@ -452,8 +406,32 @@ def fit(self): old_log_prob.batch.pop("entropys") batch = batch.union(old_log_prob) - # 计算reference log probabilities + if "rollout_log_probs" in batch.batch.keys(): + # TODO: we may want to add diff of probs too. 
+ rollout_old_log_probs = batch.batch["rollout_log_probs"] + actor_old_log_probs = batch.batch["old_log_probs"] + attention_mask = batch.batch["attention_mask"] + responses = batch.batch["responses"] + response_length = responses.size(1) + response_mask = attention_mask[:, -response_length:] + + rollout_probs = torch.exp(rollout_old_log_probs) + actor_probs = torch.exp(actor_old_log_probs) + rollout_probs_diff = torch.abs(rollout_probs - actor_probs) + rollout_probs_diff = torch.masked_select(rollout_probs_diff, response_mask.bool()) + rollout_probs_diff_max = torch.max(rollout_probs_diff) + rollout_probs_diff_mean = torch.mean(rollout_probs_diff) + rollout_probs_diff_std = torch.std(rollout_probs_diff) + metrics.update( + { + "training/rollout_probs_diff_max": rollout_probs_diff_max.detach().item(), + "training/rollout_probs_diff_mean": rollout_probs_diff_mean.detach().item(), + "training/rollout_probs_diff_std": rollout_probs_diff_std.detach().item(), + } + ) + if self.use_reference_policy: + # compute reference log_prob with marked_timer("ref", timing_raw, color="olive"): if not self.ref_in_actor: ref_log_prob = self.ref_policy_wg.compute_ref_log_prob(batch) @@ -461,22 +439,23 @@ def fit(self): ref_log_prob = self.actor_wg.compute_ref_log_prob(batch) batch = batch.union(ref_log_prob) - # 计算values + # compute values if self.use_critic: with marked_timer("values", timing_raw, color="cyan"): values = self.critic_wg.compute_values(batch) batch = batch.union(values) - # 处理奖励和优势计算 with marked_timer("adv", timing_raw, color="brown"): - if self.config.reward_model.get("launch_reward_fn_async", False): + # we combine with rule-based rm + reward_extra_infos_dict: dict[str, list] + if self.config.reward_model.launch_reward_fn_async: reward_tensor, reward_extra_infos_dict = ray.get(future_reward) batch.batch["token_level_scores"] = reward_tensor if reward_extra_infos_dict: batch.non_tensor_batch.update({k: np.array(v) for k, v in reward_extra_infos_dict.items()}) - # 应用KL惩罚 
+ # compute rewards. apply_kl_penalty if available if self.config.algorithm.use_kl_in_reward: batch, kl_metrics = apply_kl_penalty( batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.config.algorithm.kl_penalty @@ -485,8 +464,11 @@ def fit(self): else: batch.batch["token_level_rewards"] = batch.batch["token_level_scores"] - # 计算优势 - norm_adv_by_std_in_grpo = self.config.algorithm.get("norm_adv_by_std_in_grpo", True) + # compute advantages, executed on the driver process + + norm_adv_by_std_in_grpo = self.config.algorithm.get( + "norm_adv_by_std_in_grpo", True + ) # GRPO adv normalization factor batch = compute_advantage( batch, @@ -498,32 +480,34 @@ def fit(self): config=self.config.algorithm, ) - # 更新critic + # update critic if self.use_critic: with marked_timer("update_critic", timing_raw, color="pink"): critic_output = self.critic_wg.update_critic(batch) critic_output_metrics = reduce_metrics(critic_output.meta_info["metrics"]) metrics.update(critic_output_metrics) - # 更新actor + # implement critic warmup if self.config.trainer.critic_warmup <= self.global_steps: + # update actor with marked_timer("update_actor", timing_raw, color="red"): batch.meta_info["multi_turn"] = self.config.actor_rollout_ref.rollout.multi_turn.enable actor_output = self.actor_wg.update_actor(batch) actor_output_metrics = reduce_metrics(actor_output.meta_info["metrics"]) metrics.update(actor_output_metrics) - # 同步参数到Rollouter - with marked_timer("sync_params", timing_raw, color="purple"): - self._sync_parameters_to_rollouter() - - # 记录rollout生成 + # Log rollout generations if enabled rollout_data_dir = self.config.trainer.get("rollout_data_dir", None) if rollout_data_dir: with marked_timer("dump_rollout_generations", timing_raw, color="green"): inputs = self.tokenizer.batch_decode(batch.batch["prompts"], skip_special_tokens=True) outputs = self.tokenizer.batch_decode(batch.batch["responses"], skip_special_tokens=True) scores = batch.batch["token_level_scores"].sum(-1).cpu().tolist() + 
if "request_id" in batch.non_tensor_batch: + reward_extra_infos_dict.setdefault( + "request_id", + batch.non_tensor_batch["request_id"].tolist(), + ) self._dump_generations( inputs=inputs, outputs=outputs, @@ -532,97 +516,80 @@ def fit(self): dump_path=rollout_data_dir, ) - # 验证 + # validate if ( - self.val_reward_fn is not None - and self.config.trainer.test_freq > 0 - and (is_last_step or self.global_steps % self.config.trainer.test_freq == 0) + self.val_reward_fn is not None + and self.config.trainer.test_freq > 0 + and (is_last_step or self.global_steps % self.config.trainer.test_freq == 0) ): with marked_timer("testing", timing_raw, color="green"): - val_metrics = self._validate() + val_metrics: dict = self._validate() if is_last_step: last_val_metrics = val_metrics - print(last_val_metrics) - if val_metrics: - metrics.update(val_metrics) - - # 保存检查点 + metrics.update(val_metrics) + + # Check if the ESI (Elastic Server Instance)/training plan is close to expiration. + esi_close_to_expiration = should_save_ckpt_esi( + max_steps_duration=self.max_steps_duration, + redundant_time=self.config.trainer.esi_redundant_time, + ) + # Check if the conditions for saving a checkpoint are met. + # The conditions include a mandatory condition (1) and + # one of the following optional conditions (2/3/4): + # 1. The save frequency is set to a positive value. + # 2. It's the last training step. + # 3. The current step number is a multiple of the save frequency. + # 4. The ESI(Elastic Server Instance)/training plan is close to expiration. 
q if self.config.trainer.save_freq > 0 and ( - is_last_step or self.global_steps % self.config.trainer.save_freq == 0 + is_last_step + or self.global_steps % self.config.trainer.save_freq == 0 + or esi_close_to_expiration ): + if esi_close_to_expiration: + print("Force saving checkpoint: ESI instance expiration approaching.") with marked_timer("save_checkpoint", timing_raw, color="green"): self._save_checkpoint() - # 收集指标 - 参考OneStepOffRayTrainer的指标收集 + with marked_timer("stop_profile", timing_raw): + self._stop_profiling(do_profile) + + steps_duration = timing_raw["step"] + self.max_steps_duration = max(self.max_steps_duration, steps_duration) + + # training metrics metrics.update( { "training/global_step": self.global_steps, - "training/param_version": self.current_param_version, - "training/param_sync_count": self.param_sync_count, + "training/epoch": epoch, } ) - - # 数据和性能指标 + # collect metrics metrics.update(compute_data_metrics(batch=batch, use_critic=self.use_critic)) metrics.update(compute_timing_metrics(batch=batch, timing_raw=timing_raw)) - + # TODO: implement actual tflpo and theoretical tflpo n_gpus = self.resource_pool_manager.get_n_gpus() metrics.update(compute_throughout_metrics(batch=batch, timing_raw=timing_raw, n_gpus=n_gpus)) - # 队列状态指标 - queue_size = self.message_queue_client.get_queue_size() - queue_stats = self.message_queue_client.get_statistics() - metrics.update( - { - "queue/size": queue_size, - "queue/total_produced": queue_stats["total_produced"], - "queue/total_consumed": queue_stats["total_consumed"], - "queue/dropped_samples": queue_stats["dropped_samples"], - } - ) + # this is experimental and may be changed/removed in the future in favor of a general-purpose one + if isinstance(self.train_dataloader.sampler, AbstractCurriculumSampler): + self.train_dataloader.sampler.update(batch=batch) - # 记录日志 - logger_tracker.log(data=metrics, step=self.global_steps) + # TODO: make a canonical logger that supports various backend + 
logger.log(data=metrics, step=self.global_steps) - # 更新进度条 progress_bar.update(1) - progress_bar.set_postfix( - { - "reward": f"{metrics.get('reward/mean', 0):.3f}", - "kl": f"{metrics.get('actor/approx_kl', 0):.3f}", - "queue_size": queue_size, - "param_ver": self.current_param_version, - "avg_age": f"{metrics.get('freshness/avg_sample_age', 0):.1f}", - } - ) - - if do_profile: - self.actor_wg.stop_profile() - if self.use_reference_policy and not self.ref_in_actor: - self.ref_policy_wg.stop_profile() - if self.use_critic: - self.critic_wg.stop_profile() - if self.use_rm: - self.rm_wg.stop_profile() - self.global_steps += 1 - self.processed_samples += len(batch_samples) if is_last_step: - break - - progress_bar.close() - logger.info(f"Training completed after {self.global_steps} steps") - - # 最终验证 - if self.val_reward_fn is not None: - val_metrics = self._validate() - if val_metrics: - pprint(f"Final validation metrics: {val_metrics}") - logger_tracker.log(data=val_metrics, step=self.global_steps) + pprint(f"Final validation metrics: {last_val_metrics}") + progress_bar.close() + return - # 最终检查点保存 - self._save_checkpoint() + # this is experimental and may be changed/removed in the future + # in favor of a general-purpose data buffer pool + if hasattr(self.train_dataset, "on_batch_end"): + # The dataset may be changed after each training batch + self.train_dataset.on_batch_end(batch=batch) def get_statistics(self) -> dict: """获取训练统计信息""" diff --git a/recipe/one_step_off_policy/ray_trainer.py b/recipe/one_step_off_policy/ray_trainer.py index 1f7011bdf54..127ed2d0c24 100644 --- a/recipe/one_step_off_policy/ray_trainer.py +++ b/recipe/one_step_off_policy/ray_trainer.py @@ -105,7 +105,7 @@ def __init__( val_dataset: Dataset | None = None, collate_fn=None, train_sampler: Sampler | None = None, - device_name="cuda", + device_name=None, ): """ Initialize distributed PPO trainer with Ray backend. 
@@ -143,8 +143,11 @@ def __init__( self.use_reference_policy = Role.RefPolicy in role_worker_mapping self.use_rm = Role.RewardModel in role_worker_mapping self.ray_worker_group_cls = ray_worker_group_cls - self.device_name = device_name - self.validation_generations_logger = ValidationGenerationsLogger() + self.device_name = device_name if device_name else self.config.trainer.device + self.validation_generations_logger = ValidationGenerationsLogger( + project_name=self.config.trainer.project_name, + experiment_name=self.config.trainer.experiment_name, + ) # if ref_in_actor is True, the reference policy will be actor without lora applied self.ref_in_actor = config.actor_rollout_ref.model.get("lora_rank", 0) > 0 @@ -286,7 +289,7 @@ def init_workers(self): # create async rollout manager and request scheduler self.async_rollout_mode = False - if self.config.actor_rollout_ref.rollout.mode == "async" and self._is_rollout: + if self.config.actor_rollout_ref.rollout.mode == "async": from verl.workers.rollout.async_server import AsyncLLMServerManager self.async_rollout_mode = True From a5ee455ecaad386f3288108da4f174cdf06b6e7f Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Thu, 31 Jul 2025 21:09:28 +0800 Subject: [PATCH 014/182] train --- recipe/one_step_off_policy/main_ppo.py | 2 +- recipe/one_step_off_policy/ray_trainer.py | 278 ++---------- verl/trainer/ppo/ray_trainer.py | 527 +++++++++++----------- 3 files changed, 295 insertions(+), 512 deletions(-) diff --git a/recipe/one_step_off_policy/main_ppo.py b/recipe/one_step_off_policy/main_ppo.py index d6072c5521e..0a037df17fa 100644 --- a/recipe/one_step_off_policy/main_ppo.py +++ b/recipe/one_step_off_policy/main_ppo.py @@ -188,7 +188,7 @@ def run(self, config): def main(config): from verl.trainer.main_ppo import run_ppo - run_ppo(config) + run_ppo(config, OneStepOffTaskRunner) if __name__ == "__main__": diff --git a/recipe/one_step_off_policy/ray_trainer.py b/recipe/one_step_off_policy/ray_trainer.py index 
127ed2d0c24..c1687561d01 100644 --- a/recipe/one_step_off_policy/ray_trainer.py +++ b/recipe/one_step_off_policy/ray_trainer.py @@ -18,40 +18,25 @@ This trainer supports model-agonistic model initialization with huggingface """ -import uuid +import warnings from pprint import pprint -import numpy as np import ray -import torch from omegaconf import OmegaConf from torch.utils.data import Dataset, Sampler from tqdm import tqdm -from verl import DataProto from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup from verl.single_controller.ray.base import create_colocated_worker_cls from verl.trainer.ppo import core_algos -from verl.trainer.ppo.core_algos import AdvantageEstimator, agg_loss -from verl.trainer.ppo.metric_utils import ( - compute_data_metrics, - compute_throughout_metrics, - compute_timing_metrics, -) +from verl.trainer.ppo.core_algos import AdvantageEstimator from verl.trainer.ppo.ray_trainer import ( RayPPOTrainer, ResourcePoolManager, Role, WorkerType, - apply_kl_penalty, - compute_advantage, - compute_response_mask, ) -from verl.trainer.ppo.reward import compute_reward, compute_reward_async from verl.utils.debug import marked_timer -from verl.utils.metric import ( - reduce_metrics, -) from verl.utils.tracking import ValidationGenerationsLogger @@ -154,24 +139,20 @@ def __init__( # define in-reward KL control # kl loss control currently not suppoorted - if config.algorithm.use_kl_in_reward: - self.kl_ctrl_in_reward = core_algos.get_kl_controller(config.algorithm.kl_ctrl) + if self.config.algorithm.use_kl_in_reward: + self.kl_ctrl_in_reward = core_algos.get_kl_controller(self.config.algorithm.kl_ctrl) - if self.config.algorithm.adv_estimator == AdvantageEstimator.GAE: + if config.critic.enable is not None: + self.use_critic = bool(config.critic.enable) + elif self.config.algorithm.adv_estimator == AdvantageEstimator.GAE: self.use_critic = True - elif self.config.algorithm.adv_estimator in [ - AdvantageEstimator.GRPO, - 
AdvantageEstimator.GRPO_PASSK, - AdvantageEstimator.REINFORCE_PLUS_PLUS, - # AdvantageEstimator.REMAX, # TODO:REMAX advantage estimator is not yet supported in one_step_off_policy - AdvantageEstimator.RLOO, - AdvantageEstimator.OPO, - AdvantageEstimator.REINFORCE_PLUS_PLUS_BASELINE, - AdvantageEstimator.GPG, - ]: - self.use_critic = False else: - raise NotImplementedError + warnings.warn( + "Disabled critic as algorithm.adv_estimator != gae. " + "If it is not intended, please set critic.enable=True", + stacklevel=2, + ) + self.use_critic = False self._validate_config() self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler) @@ -323,23 +304,7 @@ def _async_gen_next_batch(self, continuous_iterator): except Exception as e: print(f"Error in async_gen_next_batch: {e}") return None - batch = DataProto.from_single_dict(batch_dict) - # pop those keys for generation - batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"] - non_tensor_batch_keys_to_pop = ["raw_prompt_ids"] - if "multi_modal_data" in batch.non_tensor_batch: - non_tensor_batch_keys_to_pop.append("multi_modal_data") - if "raw_prompt" in batch.non_tensor_batch: - non_tensor_batch_keys_to_pop.append("raw_prompt") - if "tools_kwargs" in batch.non_tensor_batch: - non_tensor_batch_keys_to_pop.append("tools_kwargs") - if "interaction_kwargs" in batch.non_tensor_batch: - non_tensor_batch_keys_to_pop.append("interaction_kwargs") - gen_batch = batch.pop( - batch_keys=batch_keys_to_pop, - non_tensor_batch_keys=non_tensor_batch_keys_to_pop, - ) - gen_batch = gen_batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True) + batch, gen_batch = self._prepare_generate_batch(batch_dict) # sync weights from actor to rollout self.sync_rollout_weights() # async generation @@ -385,6 +350,7 @@ def fit(self): # we start from step 1 self.global_steps += 1 last_val_metrics = None + self.max_steps_duration = 0 # across epoch iterator continuous_iterator = 
self._create_continuous_iterator() @@ -393,24 +359,16 @@ def fit(self): batch_data_future = self._async_gen_next_batch(continuous_iterator) while batch_data_future is not None: + metrics = {} + timing_raw = {} + do_profile = ( self.global_steps in self.config.trainer.profile_steps if self.config.trainer.profile_steps is not None else False ) - if do_profile: - self.actor_wg.start_profile() - if not self.hybrid_engine: - self.rollout_wg.start_profile() - if self.use_reference_policy: - self.ref_policy_wg.start_profile() - if self.use_critic: - self.critic_wg.start_profile() - if self.use_rm: - self.rm_wg.start_profile() + self._start_profiling(do_profile, timing_raw) - metrics = {} - timing_raw = {} is_last_step = self.global_steps >= self.total_training_steps with marked_timer("step", timing_raw): @@ -425,184 +383,15 @@ def fit(self): if not is_last_step: batch_data_future = self._async_gen_next_batch(continuous_iterator) - batch.non_tensor_batch["uid"] = np.array( - [str(uuid.uuid4()) for _ in range(len(batch.batch))], dtype=object - ) - # repeat to align with repeated responses in rollout - batch = batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True) - batch = batch.union(gen_batch_output) - - batch.batch["response_mask"] = compute_response_mask(batch) - # Balance the number of valid tokens across DP ranks. - # NOTE: This usually changes the order of data in the `batch`, - # which won't affect the advantage calculation (since it's based on uid), - # but might affect the loss calculation (due to the change of mini-batching). - # TODO: Decouple the DP balancing and mini-batching. 
- if self.config.trainer.balance_batch: - self._balance_batch(batch, metrics=metrics) - - # compute global_valid tokens - batch.meta_info["global_token_num"] = torch.sum(batch.batch["attention_mask"], dim=-1).tolist() - - with marked_timer("reward", timing_raw, color="yellow"): - # compute reward model score - if self.use_rm: - reward_tensor = self.rm_wg.compute_rm_score(batch) - batch = batch.union(reward_tensor) - - if self.config.reward_model.launch_reward_fn_async: - future_reward = compute_reward_async.remote(batch, self.config, self.tokenizer) - else: - reward_tensor, reward_extra_infos_dict = compute_reward(batch, self.reward_fn) - - # recompute old_log_probs - with marked_timer("old_log_prob", timing_raw, color="blue"): - old_log_prob = self.actor_wg.compute_log_prob(batch) - entropys = old_log_prob.batch["entropys"] - response_masks = batch.batch["response_mask"] - loss_agg_mode = self.config.actor_rollout_ref.actor.loss_agg_mode - entropy_agg = agg_loss(loss_mat=entropys, loss_mask=response_masks, loss_agg_mode=loss_agg_mode) - old_log_prob_metrics = {"actor/entropy": entropy_agg.detach().item()} - metrics.update(old_log_prob_metrics) - old_log_prob.batch.pop("entropys") - batch = batch.union(old_log_prob) - - if "rollout_log_probs" in batch.batch.keys(): - # TODO: we may want to add diff of probs too. 
- rollout_old_log_probs = batch.batch["rollout_log_probs"] - actor_old_log_probs = batch.batch["old_log_probs"] - attention_mask = batch.batch["attention_mask"] - responses = batch.batch["responses"] - response_length = responses.size(1) - response_mask = attention_mask[:, -response_length:] - - rollout_probs = torch.exp(rollout_old_log_probs) - actor_probs = torch.exp(actor_old_log_probs) - rollout_probs_diff = torch.abs(rollout_probs - actor_probs) - rollout_probs_diff = torch.masked_select(rollout_probs_diff, response_mask.bool()) - rollout_probs_diff_max = torch.max(rollout_probs_diff) - rollout_probs_diff_mean = torch.mean(rollout_probs_diff) - rollout_probs_diff_std = torch.std(rollout_probs_diff) - metrics.update( - { - "training/rollout_probs_diff_max": rollout_probs_diff_max.detach().item(), - "training/rollout_probs_diff_mean": rollout_probs_diff_mean.detach().item(), - "training/rollout_probs_diff_std": rollout_probs_diff_std.detach().item(), - } - ) - - if self.use_reference_policy: - # compute reference log_prob - with marked_timer("ref", timing_raw, color="olive"): - if not self.ref_in_actor: - ref_log_prob = self.ref_policy_wg.compute_ref_log_prob(batch) - else: - ref_log_prob = self.actor_wg.compute_ref_log_prob(batch) - batch = batch.union(ref_log_prob) - - # compute values - if self.use_critic: - with marked_timer("values", timing_raw, color="cyan"): - values = self.critic_wg.compute_values(batch) - batch = batch.union(values) - - with marked_timer("adv", timing_raw, color="brown"): - # we combine with rule-based rm - reward_extra_infos_dict: dict[str, list] - if self.config.reward_model.launch_reward_fn_async: - reward_tensor, reward_extra_infos_dict = ray.get(future_reward) - batch.batch["token_level_scores"] = reward_tensor - - if reward_extra_infos_dict: - batch.non_tensor_batch.update({k: np.array(v) for k, v in reward_extra_infos_dict.items()}) - - # compute rewards. 
apply_kl_penalty if available - if self.config.algorithm.use_kl_in_reward: - batch, kl_metrics = apply_kl_penalty( - batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.config.algorithm.kl_penalty - ) - metrics.update(kl_metrics) - else: - batch.batch["token_level_rewards"] = batch.batch["token_level_scores"] - - # compute advantages, executed on the driver process - - norm_adv_by_std_in_grpo = self.config.algorithm.get( - "norm_adv_by_std_in_grpo", True - ) # GRPO adv normalization factor - - batch = compute_advantage( - batch, - adv_estimator=self.config.algorithm.adv_estimator, - gamma=self.config.algorithm.gamma, - lam=self.config.algorithm.lam, - num_repeat=self.config.actor_rollout_ref.rollout.n, - norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo, - config=self.config.algorithm, - ) - - # update critic - if self.use_critic: - with marked_timer("update_critic", timing_raw, color="pink"): - critic_output = self.critic_wg.update_critic(batch) - critic_output_metrics = reduce_metrics(critic_output.meta_info["metrics"]) - metrics.update(critic_output_metrics) - - # implement critic warmup - if self.config.trainer.critic_warmup <= self.global_steps: - # update actor - with marked_timer("update_actor", timing_raw, color="red"): - batch.meta_info["multi_turn"] = self.config.actor_rollout_ref.rollout.multi_turn.enable - actor_output = self.actor_wg.update_actor(batch) - actor_output_metrics = reduce_metrics(actor_output.meta_info["metrics"]) - metrics.update(actor_output_metrics) - - # Log rollout generations if enabled - rollout_data_dir = self.config.trainer.get("rollout_data_dir", None) - if rollout_data_dir: - with marked_timer("dump_rollout_generations", timing_raw, color="green"): - inputs = self.tokenizer.batch_decode(batch.batch["prompts"], skip_special_tokens=True) - outputs = self.tokenizer.batch_decode(batch.batch["responses"], skip_special_tokens=True) - scores = batch.batch["token_level_scores"].sum(-1).cpu().tolist() - self._dump_generations( - 
inputs=inputs, - outputs=outputs, - scores=scores, - reward_extra_infos_dict=reward_extra_infos_dict, - dump_path=rollout_data_dir, - ) - - # validate - if ( - self.val_reward_fn is not None - and self.config.trainer.test_freq > 0 - and (is_last_step or self.global_steps % self.config.trainer.test_freq == 0) - ): - with marked_timer("testing", timing_raw, color="green"): - val_metrics: dict = self._validate() - if is_last_step: - last_val_metrics = val_metrics - metrics.update(val_metrics) - - if self.config.trainer.save_freq > 0 and ( - is_last_step or self.global_steps % self.config.trainer.save_freq == 0 - ): - with marked_timer("save_checkpoint", timing_raw, color="green"): - self._save_checkpoint() - - # training metrics - metrics.update( - { - "training/global_step": self.global_steps, - "training/epoch": epoch, - } - ) - # collect metrics - metrics.update(compute_data_metrics(batch=batch, use_critic=self.use_critic)) - metrics.update(compute_timing_metrics(batch=batch, timing_raw=timing_raw)) - # TODO: implement actual tflpo and theoretical tflpo - n_gpus = self.resource_pool_manager.get_n_gpus() - metrics.update(compute_throughout_metrics(batch=batch, timing_raw=timing_raw, n_gpus=n_gpus)) + batch = self._post_generate_batch(batch, gen_batch_output, metrics) + batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw) + self._log_rollout(batch, reward_extra_infos_dict, timing_raw) + last_val_metrics = self._validate_metrics(is_last_step, last_val_metrics, metrics, timing_raw) + self._check_save_checkpoint(is_last_step, timing_raw) + + self._stop_profiling(do_profile, timing_raw) + self._collect_metrics(batch, epoch, metrics, timing_raw) + self._post_batch_processing(batch) # TODO: make a canonical logger that supports various backend logger.log(data=metrics, step=self.global_steps) @@ -610,17 +399,6 @@ def fit(self): progress_bar.update(1) self.global_steps += 1 - if do_profile: - self.actor_wg.stop_profile() - if not 
self.hybrid_engine: - self.rollout_wg.stop_profile() - if self.use_reference_policy: - self.ref_policy_wg.stop_profile() - if self.use_critic: - self.critic_wg.stop_profile() - if self.use_rm: - self.rm_wg.stop_profile() - if is_last_step: pprint(f"Final validation metrics: {last_val_metrics}") progress_bar.close() diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py index 6a82a4bcf2b..49334db6bcd 100644 --- a/verl/trainer/ppo/ray_trainer.py +++ b/verl/trainer/ppo/ray_trainer.py @@ -989,27 +989,29 @@ def _load_checkpoint(self): else: print(f"Warning: No dataloader state found at {dataloader_local_path}, will start from scratch") - def _start_profiling(self, do_profile: bool) -> None: + def _start_profiling(self, do_profile: bool, timing_raw) -> None: """Start profiling for all worker groups if profiling is enabled.""" - if do_profile: - self.actor_rollout_wg.start_profile(role="e2e", profile_step=self.global_steps) - if self.use_reference_policy: - self.ref_policy_wg.start_profile() - if self.use_critic: - self.critic_wg.start_profile() - if self.use_rm: - self.rm_wg.start_profile() - - def _stop_profiling(self, do_profile: bool) -> None: + with marked_timer("start_profile", timing_raw): + if do_profile: + self.actor_rollout_wg.start_profile(role="e2e", profile_step=self.global_steps) + if self.use_reference_policy: + self.ref_policy_wg.start_profile() + if self.use_critic: + self.critic_wg.start_profile() + if self.use_rm: + self.rm_wg.start_profile() + + def _stop_profiling(self, do_profile: bool, timing_raw) -> None: """Stop profiling for all worker groups if profiling is enabled.""" - if do_profile: - self.actor_rollout_wg.stop_profile() - if self.use_reference_policy: - self.ref_policy_wg.stop_profile() - if self.use_critic: - self.critic_wg.stop_profile() - if self.use_rm: - self.rm_wg.stop_profile() + with marked_timer("stop_profile", timing_raw): + if do_profile: + self.actor_rollout_wg.stop_profile() + if 
self.use_reference_policy: + self.ref_policy_wg.stop_profile() + if self.use_critic: + self.critic_wg.stop_profile() + if self.use_rm: + self.rm_wg.stop_profile() def _balance_batch(self, batch: DataProto, metrics, logging_prefix="global_seqlen"): """Reorder the data on single controller such that each dp rank gets similar total tokens""" @@ -1079,35 +1081,9 @@ def fit(self): if self.config.trainer.profile_steps is not None else False ) - with marked_timer("start_profile", timing_raw): - self._start_profiling(do_profile) - - batch: DataProto = DataProto.from_single_dict(batch_dict) - - # pop those keys for generation - batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"] - non_tensor_batch_keys_to_pop = ["raw_prompt_ids"] - if "multi_modal_data" in batch.non_tensor_batch: - non_tensor_batch_keys_to_pop.append("multi_modal_data") - if "raw_prompt" in batch.non_tensor_batch: - non_tensor_batch_keys_to_pop.append("raw_prompt") - if "tools_kwargs" in batch.non_tensor_batch: - non_tensor_batch_keys_to_pop.append("tools_kwargs") - if "interaction_kwargs" in batch.non_tensor_batch: - non_tensor_batch_keys_to_pop.append("interaction_kwargs") - if "index" in batch.non_tensor_batch: - non_tensor_batch_keys_to_pop.append("index") - if "agent_name" in batch.non_tensor_batch: - non_tensor_batch_keys_to_pop.append("agent_name") - - gen_batch = batch.pop( - batch_keys=batch_keys_to_pop, - non_tensor_batch_keys=non_tensor_batch_keys_to_pop, - ) + self._start_profiling(do_profile, timing_raw) - # pass global_steps to trace - gen_batch.meta_info["global_steps"] = self.global_steps - gen_batch = gen_batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True) + batch, gen_batch = self._prepare_generate_batch(batch_dict) is_last_step = self.global_steps >= self.total_training_steps @@ -1139,216 +1115,15 @@ def fit(self): del gen_baseline_batch, gen_baseline_output - batch.non_tensor_batch["uid"] = np.array( - [str(uuid.uuid4()) for _ in 
range(len(batch.batch))], dtype=object - ) - # repeat to align with repeated responses in rollout - batch = batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True) - batch = batch.union(gen_batch_output) - - if "response_mask" not in batch.batch.keys(): - batch.batch["response_mask"] = compute_response_mask(batch) - # Balance the number of valid tokens across DP ranks. - # NOTE: This usually changes the order of data in the `batch`, - # which won't affect the advantage calculation (since it's based on uid), - # but might affect the loss calculation (due to the change of mini-batching). - # TODO: Decouple the DP balancing and mini-batching. - if self.config.trainer.balance_batch: - self._balance_batch(batch, metrics=metrics) - - # compute global_valid tokens - batch.meta_info["global_token_num"] = torch.sum(batch.batch["attention_mask"], dim=-1).tolist() - - with marked_timer("reward", timing_raw, color="yellow"): - # compute reward model score - if self.use_rm: - reward_tensor = self.rm_wg.compute_rm_score(batch) - batch = batch.union(reward_tensor) - - if self.config.reward_model.launch_reward_fn_async: - future_reward = compute_reward_async.remote(data=batch, reward_fn=self.reward_fn) - else: - reward_tensor, reward_extra_infos_dict = compute_reward(batch, self.reward_fn) - - # recompute old_log_probs - with marked_timer("old_log_prob", timing_raw, color="blue"): - old_log_prob = self.actor_rollout_wg.compute_log_prob(batch) - entropys = old_log_prob.batch["entropys"] - response_masks = batch.batch["response_mask"] - loss_agg_mode = self.config.actor_rollout_ref.actor.loss_agg_mode - entropy_agg = agg_loss(loss_mat=entropys, loss_mask=response_masks, loss_agg_mode=loss_agg_mode) - old_log_prob_metrics = {"actor/entropy": entropy_agg.detach().item()} - metrics.update(old_log_prob_metrics) - old_log_prob.batch.pop("entropys") - batch = batch.union(old_log_prob) - - if "rollout_log_probs" in batch.batch.keys(): - # TODO: we may want to 
add diff of probs too. - rollout_old_log_probs = batch.batch["rollout_log_probs"] - actor_old_log_probs = batch.batch["old_log_probs"] - attention_mask = batch.batch["attention_mask"] - responses = batch.batch["responses"] - response_length = responses.size(1) - response_mask = attention_mask[:, -response_length:] - - rollout_probs = torch.exp(rollout_old_log_probs) - actor_probs = torch.exp(actor_old_log_probs) - rollout_probs_diff = torch.abs(rollout_probs - actor_probs) - rollout_probs_diff = torch.masked_select(rollout_probs_diff, response_mask.bool()) - rollout_probs_diff_max = torch.max(rollout_probs_diff) - rollout_probs_diff_mean = torch.mean(rollout_probs_diff) - rollout_probs_diff_std = torch.std(rollout_probs_diff) - metrics.update( - { - "training/rollout_probs_diff_max": rollout_probs_diff_max.detach().item(), - "training/rollout_probs_diff_mean": rollout_probs_diff_mean.detach().item(), - "training/rollout_probs_diff_std": rollout_probs_diff_std.detach().item(), - } - ) - - if self.use_reference_policy: - # compute reference log_prob - with marked_timer("ref", timing_raw, color="olive"): - if not self.ref_in_actor: - ref_log_prob = self.ref_policy_wg.compute_ref_log_prob(batch) - else: - ref_log_prob = self.actor_rollout_wg.compute_ref_log_prob(batch) - batch = batch.union(ref_log_prob) - - # compute values - if self.use_critic: - with marked_timer("values", timing_raw, color="cyan"): - values = self.critic_wg.compute_values(batch) - batch = batch.union(values) - - with marked_timer("adv", timing_raw, color="brown"): - # we combine with rule-based rm - reward_extra_infos_dict: dict[str, list] - if self.config.reward_model.launch_reward_fn_async: - reward_tensor, reward_extra_infos_dict = ray.get(future_reward) - batch.batch["token_level_scores"] = reward_tensor - - if reward_extra_infos_dict: - batch.non_tensor_batch.update({k: np.array(v) for k, v in reward_extra_infos_dict.items()}) - - # compute rewards. 
apply_kl_penalty if available - if self.config.algorithm.use_kl_in_reward: - batch, kl_metrics = apply_kl_penalty( - batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.config.algorithm.kl_penalty - ) - metrics.update(kl_metrics) - else: - batch.batch["token_level_rewards"] = batch.batch["token_level_scores"] - - # compute advantages, executed on the driver process - - norm_adv_by_std_in_grpo = self.config.algorithm.get( - "norm_adv_by_std_in_grpo", True - ) # GRPO adv normalization factor - - batch = compute_advantage( - batch, - adv_estimator=self.config.algorithm.adv_estimator, - gamma=self.config.algorithm.gamma, - lam=self.config.algorithm.lam, - num_repeat=self.config.actor_rollout_ref.rollout.n, - norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo, - config=self.config.algorithm, - ) - - # update critic - if self.use_critic: - with marked_timer("update_critic", timing_raw, color="pink"): - critic_output = self.critic_wg.update_critic(batch) - critic_output_metrics = reduce_metrics(critic_output.meta_info["metrics"]) - metrics.update(critic_output_metrics) - - # implement critic warmup - if self.config.trainer.critic_warmup <= self.global_steps: - # update actor - with marked_timer("update_actor", timing_raw, color="red"): - batch.meta_info["multi_turn"] = self.config.actor_rollout_ref.rollout.multi_turn.enable - actor_output = self.actor_rollout_wg.update_actor(batch) - actor_output_metrics = reduce_metrics(actor_output.meta_info["metrics"]) - metrics.update(actor_output_metrics) - - # Log rollout generations if enabled - rollout_data_dir = self.config.trainer.get("rollout_data_dir", None) - if rollout_data_dir: - with marked_timer("dump_rollout_generations", timing_raw, color="green"): - inputs = self.tokenizer.batch_decode(batch.batch["prompts"], skip_special_tokens=True) - outputs = self.tokenizer.batch_decode(batch.batch["responses"], skip_special_tokens=True) - scores = batch.batch["token_level_scores"].sum(-1).cpu().tolist() - if "request_id" in 
batch.non_tensor_batch: - reward_extra_infos_dict.setdefault( - "request_id", - batch.non_tensor_batch["request_id"].tolist(), - ) - self._dump_generations( - inputs=inputs, - outputs=outputs, - scores=scores, - reward_extra_infos_dict=reward_extra_infos_dict, - dump_path=rollout_data_dir, - ) - - # validate - if ( - self.val_reward_fn is not None - and self.config.trainer.test_freq > 0 - and (is_last_step or self.global_steps % self.config.trainer.test_freq == 0) - ): - with marked_timer("testing", timing_raw, color="green"): - val_metrics: dict = self._validate() - if is_last_step: - last_val_metrics = val_metrics - metrics.update(val_metrics) - - # Check if the ESI (Elastic Server Instance)/training plan is close to expiration. - esi_close_to_expiration = should_save_ckpt_esi( - max_steps_duration=self.max_steps_duration, - redundant_time=self.config.trainer.esi_redundant_time, - ) - # Check if the conditions for saving a checkpoint are met. - # The conditions include a mandatory condition (1) and - # one of the following optional conditions (2/3/4): - # 1. The save frequency is set to a positive value. - # 2. It's the last training step. - # 3. The current step number is a multiple of the save frequency. - # 4. The ESI(Elastic Server Instance)/training plan is close to expiration. 
- if self.config.trainer.save_freq > 0 and ( - is_last_step - or self.global_steps % self.config.trainer.save_freq == 0 - or esi_close_to_expiration - ): - if esi_close_to_expiration: - print("Force saving checkpoint: ESI instance expiration approaching.") - with marked_timer("save_checkpoint", timing_raw, color="green"): - self._save_checkpoint() - - with marked_timer("stop_profile", timing_raw): - self._stop_profiling(do_profile) - - steps_duration = timing_raw["step"] - self.max_steps_duration = max(self.max_steps_duration, steps_duration) - - # training metrics - metrics.update( - { - "training/global_step": self.global_steps, - "training/epoch": epoch, - } - ) - # collect metrics - metrics.update(compute_data_metrics(batch=batch, use_critic=self.use_critic)) - metrics.update(compute_timing_metrics(batch=batch, timing_raw=timing_raw)) - # TODO: implement actual tflpo and theoretical tflpo - n_gpus = self.resource_pool_manager.get_n_gpus() - metrics.update(compute_throughout_metrics(batch=batch, timing_raw=timing_raw, n_gpus=n_gpus)) + batch = self._post_generate_batch(batch, gen_batch_output, metrics) + batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw) + self._log_rollout(batch, reward_extra_infos_dict, timing_raw) + last_val_metrics = self._validate_metrics(is_last_step, last_val_metrics, metrics, timing_raw) + self._check_save_checkpoint(is_last_step, timing_raw) - # this is experimental and may be changed/removed in the future in favor of a general-purpose one - if isinstance(self.train_dataloader.sampler, AbstractCurriculumSampler): - self.train_dataloader.sampler.update(batch=batch) + self._stop_profiling(do_profile, timing_raw) + self._collect_metrics(batch, epoch, metrics, timing_raw) + self._post_batch_processing(batch) # TODO: make a canonical logger that supports various backend logger.log(data=metrics, step=self.global_steps) @@ -1361,8 +1136,238 @@ def fit(self): progress_bar.close() return - # this is 
experimental and may be changed/removed in the future - # in favor of a general-purpose data buffer pool - if hasattr(self.train_dataset, "on_batch_end"): - # The dataset may be changed after each training batch - self.train_dataset.on_batch_end(batch=batch) + def _prepare_generate_batch(self, batch_dict): + batch: DataProto = DataProto.from_single_dict(batch_dict) + # pop those keys for generation + batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"] + non_tensor_batch_keys_to_pop = ["raw_prompt_ids"] + if "multi_modal_data" in batch.non_tensor_batch: + non_tensor_batch_keys_to_pop.append("multi_modal_data") + if "raw_prompt" in batch.non_tensor_batch: + non_tensor_batch_keys_to_pop.append("raw_prompt") + if "tools_kwargs" in batch.non_tensor_batch: + non_tensor_batch_keys_to_pop.append("tools_kwargs") + if "interaction_kwargs" in batch.non_tensor_batch: + non_tensor_batch_keys_to_pop.append("interaction_kwargs") + if "index" in batch.non_tensor_batch: + non_tensor_batch_keys_to_pop.append("index") + if "agent_name" in batch.non_tensor_batch: + non_tensor_batch_keys_to_pop.append("agent_name") + gen_batch = batch.pop( + batch_keys=batch_keys_to_pop, + non_tensor_batch_keys=non_tensor_batch_keys_to_pop, + ) + # pass global_steps to trace + gen_batch.meta_info["global_steps"] = self.global_steps + gen_batch = gen_batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True) + return batch, gen_batch + + def _post_generate_batch(self, batch, gen_batch_output, metrics): + batch.non_tensor_batch["uid"] = np.array([str(uuid.uuid4()) for _ in range(len(batch.batch))], dtype=object) + # repeat to align with repeated responses in rollout + batch = batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True) + batch = batch.union(gen_batch_output) + if "response_mask" not in batch.batch.keys(): + batch.batch["response_mask"] = compute_response_mask(batch) + # Balance the number of valid tokens across DP ranks. 
+ # NOTE: This usually changes the order of data in the `batch`, + # which won't affect the advantage calculation (since it's based on uid), + # but might affect the loss calculation (due to the change of mini-batching). + # TODO: Decouple the DP balancing and mini-batching. + if self.config.trainer.balance_batch: + self._balance_batch(batch, metrics=metrics) + # compute global_valid tokens + batch.meta_info["global_token_num"] = torch.sum(batch.batch["attention_mask"], dim=-1).tolist() + return batch + + def _process_batch_common(self, batch, metrics, timing_raw): + with marked_timer("reward", timing_raw, color="yellow"): + # compute reward model score + if self.use_rm: + reward_tensor = self.rm_wg.compute_rm_score(batch) + batch = batch.union(reward_tensor) + + if self.config.reward_model.launch_reward_fn_async: + future_reward = compute_reward_async.remote(data=batch, reward_fn=self.reward_fn) + else: + reward_tensor, reward_extra_infos_dict = compute_reward(batch, self.reward_fn) + # recompute old_log_probs + with marked_timer("old_log_prob", timing_raw, color="blue"): + old_log_prob = self.actor_rollout_wg.compute_log_prob(batch) + entropys = old_log_prob.batch["entropys"] + response_masks = batch.batch["response_mask"] + loss_agg_mode = self.config.actor_rollout_ref.actor.loss_agg_mode + entropy_agg = agg_loss(loss_mat=entropys, loss_mask=response_masks, loss_agg_mode=loss_agg_mode) + old_log_prob_metrics = {"actor/entropy": entropy_agg.detach().item()} + metrics.update(old_log_prob_metrics) + old_log_prob.batch.pop("entropys") + batch = batch.union(old_log_prob) + + if "rollout_log_probs" in batch.batch.keys(): + # TODO: we may want to add diff of probs too. 
+ rollout_old_log_probs = batch.batch["rollout_log_probs"] + actor_old_log_probs = batch.batch["old_log_probs"] + attention_mask = batch.batch["attention_mask"] + responses = batch.batch["responses"] + response_length = responses.size(1) + response_mask = attention_mask[:, -response_length:] + + rollout_probs = torch.exp(rollout_old_log_probs) + actor_probs = torch.exp(actor_old_log_probs) + rollout_probs_diff = torch.abs(rollout_probs - actor_probs) + rollout_probs_diff = torch.masked_select(rollout_probs_diff, response_mask.bool()) + rollout_probs_diff_max = torch.max(rollout_probs_diff) + rollout_probs_diff_mean = torch.mean(rollout_probs_diff) + rollout_probs_diff_std = torch.std(rollout_probs_diff) + metrics.update( + { + "training/rollout_probs_diff_max": rollout_probs_diff_max.detach().item(), + "training/rollout_probs_diff_mean": rollout_probs_diff_mean.detach().item(), + "training/rollout_probs_diff_std": rollout_probs_diff_std.detach().item(), + } + ) + if self.use_reference_policy: + # compute reference log_prob + with marked_timer("ref", timing_raw, color="olive"): + if not self.ref_in_actor: + ref_log_prob = self.ref_policy_wg.compute_ref_log_prob(batch) + else: + ref_log_prob = self.actor_rollout_wg.compute_ref_log_prob(batch) + batch = batch.union(ref_log_prob) + # compute values + if self.use_critic: + with marked_timer("values", timing_raw, color="cyan"): + values = self.critic_wg.compute_values(batch) + batch = batch.union(values) + with marked_timer("adv", timing_raw, color="brown"): + # we combine with rule-based rm + reward_extra_infos_dict: dict[str, list] + if self.config.reward_model.launch_reward_fn_async: + reward_tensor, reward_extra_infos_dict = ray.get(future_reward) + batch.batch["token_level_scores"] = reward_tensor + + if reward_extra_infos_dict: + batch.non_tensor_batch.update({k: np.array(v) for k, v in reward_extra_infos_dict.items()}) + + # compute rewards. 
apply_kl_penalty if available + if self.config.algorithm.use_kl_in_reward: + batch, kl_metrics = apply_kl_penalty( + batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.config.algorithm.kl_penalty + ) + metrics.update(kl_metrics) + else: + batch.batch["token_level_rewards"] = batch.batch["token_level_scores"] + + # compute advantages, executed on the driver process + + norm_adv_by_std_in_grpo = self.config.algorithm.get( + "norm_adv_by_std_in_grpo", True + ) # GRPO adv normalization factor + + batch = compute_advantage( + batch, + adv_estimator=self.config.algorithm.adv_estimator, + gamma=self.config.algorithm.gamma, + lam=self.config.algorithm.lam, + num_repeat=self.config.actor_rollout_ref.rollout.n, + norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo, + config=self.config.algorithm, + ) + # update critic + if self.use_critic: + with marked_timer("update_critic", timing_raw, color="pink"): + critic_output = self.critic_wg.update_critic(batch) + critic_output_metrics = reduce_metrics(critic_output.meta_info["metrics"]) + metrics.update(critic_output_metrics) + # implement critic warmup + if self.config.trainer.critic_warmup <= self.global_steps: + # update actor + with marked_timer("update_actor", timing_raw, color="red"): + batch.meta_info["multi_turn"] = self.config.actor_rollout_ref.rollout.multi_turn.enable + actor_output = self.actor_rollout_wg.update_actor(batch) + actor_output_metrics = reduce_metrics(actor_output.meta_info["metrics"]) + metrics.update(actor_output_metrics) + return batch, reward_extra_infos_dict + + def _log_rollout(self, batch, reward_extra_infos_dict, timing_raw): + """Log rollout generations if enabled""" + rollout_data_dir = self.config.trainer.get("rollout_data_dir", None) + if rollout_data_dir: + with marked_timer("dump_rollout_generations", timing_raw, color="green"): + inputs = self.tokenizer.batch_decode(batch.batch["prompts"], skip_special_tokens=True) + outputs = self.tokenizer.batch_decode(batch.batch["responses"], 
skip_special_tokens=True) + scores = batch.batch["token_level_scores"].sum(-1).cpu().tolist() + if "request_id" in batch.non_tensor_batch: + reward_extra_infos_dict.setdefault( + "request_id", + batch.non_tensor_batch["request_id"].tolist(), + ) + self._dump_generations( + inputs=inputs, + outputs=outputs, + scores=scores, + reward_extra_infos_dict=reward_extra_infos_dict, + dump_path=rollout_data_dir, + ) + + def _validate_metrics(self, is_last_step, last_val_metrics, metrics, timing_raw): + if ( + self.val_reward_fn is not None + and self.config.trainer.test_freq > 0 + and (is_last_step or self.global_steps % self.config.trainer.test_freq == 0) + ): + with marked_timer("testing", timing_raw, color="green"): + val_metrics: dict = self._validate() + if is_last_step: + last_val_metrics = val_metrics + metrics.update(val_metrics) + return last_val_metrics + + def _check_save_checkpoint(self, is_last_step, timing_raw): + # Check if the ESI (Elastic Server Instance)/training plan is close to expiration. + esi_close_to_expiration = should_save_ckpt_esi( + max_steps_duration=self.max_steps_duration, + redundant_time=self.config.trainer.esi_redundant_time, + ) + # Check if the conditions for saving a checkpoint are met. + # The conditions include a mandatory condition (1) and + # one of the following optional conditions (2/3/4): + # 1. The save frequency is set to a positive value. + # 2. It's the last training step. + # 3. The current step number is a multiple of the save frequency. + # 4. The ESI(Elastic Server Instance)/training plan is close to expiration. 
+ if self.config.trainer.save_freq > 0 and ( + is_last_step or self.global_steps % self.config.trainer.save_freq == 0 or esi_close_to_expiration + ): + if esi_close_to_expiration: + print("Force saving checkpoint: ESI instance expiration approaching.") + with marked_timer("save_checkpoint", timing_raw, color="green"): + self._save_checkpoint() + + def _collect_metrics(self, batch, epoch, metrics, timing_raw): + steps_duration = timing_raw["step"] + self.max_steps_duration = max(self.max_steps_duration, steps_duration) + # training metrics + metrics.update( + { + "training/global_step": self.global_steps, + "training/epoch": epoch, + } + ) + # collect metrics + metrics.update(compute_data_metrics(batch=batch, use_critic=self.use_critic)) + metrics.update(compute_timing_metrics(batch=batch, timing_raw=timing_raw)) + # TODO: implement actual tflpo and theoretical tflpo + n_gpus = self.resource_pool_manager.get_n_gpus() + metrics.update(compute_throughout_metrics(batch=batch, timing_raw=timing_raw, n_gpus=n_gpus)) + + def _post_batch_processing(self, batch: DataProto): + # this is experimental and may be changed/removed in the future in favor of a general-purpose one + if isinstance(self.train_dataloader.sampler, AbstractCurriculumSampler): + self.train_dataloader.sampler.update(batch=batch) + + # this is experimental and may be changed/removed in the future + # in favor of a general-purpose data buffer pool + if hasattr(self.train_dataset, "on_batch_end"): + # The dataset may be changed after each training batch + self.train_dataset.on_batch_end(batch=batch) From 33ed01fe68dba9d715a4c67c8f680edd166b41c1 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Fri, 1 Aug 2025 11:48:34 +0800 Subject: [PATCH 015/182] refactor init worker --- recipe/fully_async_policy/fully_async_main.py | 1 - .../fully_async_rollouter.py | 129 ++++--- .../fully_async_policy/fully_async_trainer.py | 334 +++--------------- recipe/fully_async_policy/message_queue.py | 12 +- 
recipe/fully_async_policy/unittest/test_mq.py | 68 +--- recipe/one_step_off_policy/ray_trainer.py | 113 ++---- verl/trainer/ppo/ray_trainer.py | 76 +++- 7 files changed, 228 insertions(+), 505 deletions(-) diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py index 3773d90d8d7..e8053e74647 100644 --- a/recipe/fully_async_policy/fully_async_main.py +++ b/recipe/fully_async_policy/fully_async_main.py @@ -343,7 +343,6 @@ def _run_training_loop(self): logger.info("Starting Rollouter in background...") rollouter_future = self.components["rollouter"].fit.remote() - time.sleep(2.0) trainer_future = self.components["trainer"].fit.remote() self._monitor_components() ray.get(rollouter_future) diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 06380803aee..6b41d635013 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -28,7 +28,7 @@ from verl import DataProto from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup from verl.single_controller.ray.base import create_colocated_worker_cls -from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role, WorkerType +from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role, WorkerType, RayPPOTrainer from verl.utils.debug import marked_timer logger = logging.getLogger(__name__) @@ -116,7 +116,7 @@ def get_status(self) -> dict: @ray.remote -class FullyAsyncRollouter: +class FullyAsyncRollouter(RayPPOTrainer): """ 异步样本生成器,负责持续生成训练样本并放入MessageQueue 基于OneStepOffRayTrainer的成熟实现改进 @@ -130,23 +130,78 @@ def __init__( resource_pool_manager: ResourcePoolManager, ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, processor=None, - train_dataset: Dataset | None = None, + reward_fn=None, + val_reward_fn=None, + train_dataset: Optional[Dataset] = None, + val_dataset: Optional[Dataset] = None, collate_fn=None, - 
train_sampler: Sampler | None = None, - device_name="cuda", + train_sampler: Optional[Sampler] = None, + device_name=None, ): - self.config = config + """ + Initialize distributed PPO trainer with Ray backend. + Note that this trainer runs on the driver process on a single CPU/GPU node. + + Args: + config: Configuration object containing training parameters. + tokenizer: Tokenizer used for encoding and decoding text. + role_worker_mapping (dict[Role, WorkerType]): Mapping from roles to worker classes. + resource_pool_manager (ResourcePoolManager): Manager for Ray resource pools. + ray_worker_group_cls (RayWorkerGroup, optional): Class for Ray worker groups. Defaults to RayWorkerGroup. + processor: Optional data processor, used for multimodal data + reward_fn: Function for computing rewards during training. + val_reward_fn: Function for computing rewards during validation. + train_dataset (Optional[Dataset], optional): Training dataset. Defaults to None. + val_dataset (Optional[Dataset], optional): Validation dataset. Defaults to None. + collate_fn: Function to collate data samples into batches. + train_sampler (Optional[Sampler], optional): Sampler for the training dataset. Defaults to None. + device_name (str, optional): Device name for training (e.g., "cuda", "cpu"). Defaults to None. 
+ """ + + # Store the tokenizer for text processing self.tokenizer = tokenizer self.processor = processor + self.config = config + self.reward_fn = reward_fn + self.val_reward_fn = val_reward_fn + + self.hybrid_engine = config.actor_rollout_ref.hybrid_engine + assert not self.hybrid_engine + self.role_worker_mapping = role_worker_mapping self.resource_pool_manager = resource_pool_manager + self.use_reference_policy = Role.RefPolicy in role_worker_mapping + self.use_rm = Role.RewardModel in role_worker_mapping self.ray_worker_group_cls = ray_worker_group_cls - self.device_name = device_name + self.device_name = device_name if device_name else self.config.trainer.device + self.validation_generations_logger = ValidationGenerationsLogger( + project_name=self.config.trainer.project_name, + experiment_name=self.config.trainer.experiment_name, + ) - # 数据相关 - self.train_dataset = train_dataset - self.collate_fn = collate_fn - self.train_sampler = train_sampler + # if ref_in_actor is True, the reference policy will be actor without lora applied + self.ref_in_actor = config.actor_rollout_ref.model.get("lora_rank", 0) > 0 + + # define in-reward KL control + # kl loss control currently not suppoorted + if self.config.algorithm.use_kl_in_reward: + self.kl_ctrl_in_reward = core_algos.get_kl_controller(self.config.algorithm.kl_ctrl) + + if config.critic.enable is not None: + self.use_critic = bool(config.critic.enable) + elif self.config.algorithm.adv_estimator == AdvantageEstimator.GAE: + self.use_critic = True + else: + warnings.warn( + "Disabled critic as algorithm.adv_estimator != gae. 
" + "If it is not intended, please set critic.enable=True", + stacklevel=2, + ) + self.use_critic = False + + self._validate_config() + self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler) + self.message_queue_client = None # Rollout控制 self.rollout_controller = RolloutController() @@ -180,10 +235,13 @@ def __init__( self.sync_in_progress = False self.sync_lock = threading.Lock() - # 异步rollout模式 - self.async_rollout_mode = config.actor_rollout_ref.rollout.mode == "async" + def set_message_queue_client(self, message_queue_client: MessageQueueClient): + """设置消息队列客户端""" + self.message_queue_client = message_queue_client - self._validate_config() + def _validate(self): + """执行验证 - 参考OneStepOffRayTrainer的验证逻辑""" + return None def _validate_config(self): """验证配置""" @@ -263,10 +321,6 @@ def _init_async_rollout_manager(self): logger.warning(f"Failed to initialize async rollout manager: {e}") self.async_rollout_mode = False - def set_message_queue_client(self, message_queue_client: MessageQueueClient): - """设置消息队列客户端""" - self.message_queue_client = message_queue_client - def set_parameter_synchronizer(self, param_synchronizer): """设置参数同步器""" self.param_synchronizer = param_synchronizer @@ -370,39 +424,14 @@ def _execute_parameter_sync(self, param_version: int) -> bool: logger.error(f"Parameter sync execution failed: {e}") return False - def _create_dataloader(self): - """创建数据加载器""" - from torch.utils.data import DataLoader - - if self.train_dataset is None: - raise ValueError("Training dataset not provided") - - return DataLoader( - self.train_dataset, - batch_size=self.config.data.train_batch_size, - sampler=self.train_sampler, - collate_fn=self.collate_fn, - num_workers=self.config.data.get("dataloader_num_workers", 0), - drop_last=True, - pin_memory=True, # 改进内存管理 - ) - def _create_continuous_iterator(self): - """创建连续的数据迭代器""" - dataloader = self._create_dataloader() - - epoch = 0 - while self.running: - try: - for batch_dict in dataloader: - if 
not self.running: - return - yield epoch, batch_dict - epoch += 1 - except Exception as e: - logger.error(f"Error in data iterator: {e}") - time.sleep(1.0) # 避免快速重试 - continue + """ + Create a continuous data iterator across epoch + """ + for epoch in range(self.config.trainer.total_epochs): + iterator = iter(self.train_dataloader) + for batch_dict in iterator: + yield epoch, batch_dict def _should_pause_generation(self) -> bool: """ diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 9122a97c8fa..16354313e4e 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -16,6 +16,7 @@ import time import warnings from pprint import pprint +from typing import Optional import numpy as np import ray @@ -60,20 +61,20 @@ class FullyAsyncTrainer(RayPPOTrainer): """ def __init__( - self, - config, - tokenizer, - role_worker_mapping: dict[Role, WorkerType], - resource_pool_manager: ResourcePoolManager, - ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, - processor=None, - reward_fn=None, - val_reward_fn=None, - train_dataset: Optional[Dataset] = None, - val_dataset: Optional[Dataset] = None, - collate_fn=None, - train_sampler: Optional[Sampler] = None, - device_name=None, + self, + config, + tokenizer, + role_worker_mapping: dict[Role, WorkerType], + resource_pool_manager: ResourcePoolManager, + ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, + processor=None, + reward_fn=None, + val_reward_fn=None, + train_dataset: Optional[Dataset] = None, + val_dataset: Optional[Dataset] = None, + collate_fn=None, + train_sampler: Optional[Sampler] = None, + device_name=None, ): """ Initialize distributed PPO trainer with Ray backend. 
@@ -138,6 +139,7 @@ def __init__( self._validate_config() self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler) + self.message_queue_client = None def set_message_queue_client(self, message_queue_client: MessageQueueClient): """设置消息队列客户端""" @@ -234,63 +236,6 @@ def init_workers(self): logger.info("FullyAsyncTrainer workers initialized successfully") - def _sync_parameters_to_rollouter(self): - """同步参数到Rollouter - 改进的同步机制""" - if self.rollouter_actor is None: - logger.warning("Rollouter actor not set, skipping parameter sync") - return - - self.current_param_version += 1 - - try: - # 通知MessageQueue更新参数版本 - self.message_queue_client.update_param_version(self.current_param_version) - - # 同步参数到Rollouter - sync_future = self.rollouter_actor.update_rollout_weights.remote(self.current_param_version) - ray.get(sync_future) - - self.param_sync_count += 1 - logger.info(f"Parameter sync completed, version: {self.current_param_version}") - - except Exception as e: - logger.error(f"Failed to sync parameters: {e}") - self.current_param_version -= 1 # 回滚版本号 - raise - - def _process_batch_samples(self, batch_samples: list[QueueSample]) -> DataProto: - """处理从队列获取的batch样本 - 改进的批处理逻辑""" - if not batch_samples: - raise ValueError("Empty batch samples") - - if len(batch_samples) == 1: - return batch_samples[0].data - - # 合并多个batch - 使用DataProto的concat方法 - try: - all_batches = [sample.data for sample in batch_samples] - merged_batch = DataProto.concat(all_batches) - logger.debug(f"Successfully merged {len(batch_samples)} batches") - return merged_batch - except Exception as e: - logger.error(f"Failed to merge batch samples: {e}") - raise - - def _compute_sample_freshness_metrics(self, batch_samples: list[QueueSample]) -> dict: - """计算样本新鲜度指标""" - sample_ages = [self.current_param_version - sample.param_version for sample in batch_samples] - current_time = time.time() - sample_latencies = [current_time - sample.timestamp for sample in batch_samples] - - return { 
- "freshness/avg_sample_age": np.mean(sample_ages), - "freshness/max_sample_age": max(sample_ages), - "freshness/min_sample_age": min(sample_ages), - "freshness/avg_sample_latency": np.mean(sample_latencies), - "freshness/max_sample_latency": max(sample_latencies), - "freshness/stale_samples_ratio": sum(1 for age in sample_ages if age > 1) / len(sample_ages), - } - def fit(self): """ The training loop of PPO. @@ -298,6 +243,11 @@ def fit(self): to construct the PPO dataflow. The light-weight advantage computation is done on the driver process. """ + logger.info("Starting Trainer...") + + if self.message_queue_client is None: + raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.") + from omegaconf import OmegaConf from verl.utils.tracking import Tracking @@ -330,6 +280,7 @@ def fit(self): # we start from step 1 self.global_steps += 1 last_val_metrics = None + self.max_steps_duration = 0 # across epoch iterator continuous_iterator = self._create_continuous_iterator() @@ -346,8 +297,7 @@ def fit(self): if self.config.trainer.profile_steps is not None else False ) - with marked_timer("start_profile", timing_raw): - self._start_profiling(do_profile) + self._start_profiling(do_profile, timing_raw) is_last_step = self.global_steps >= self.total_training_steps @@ -363,216 +313,15 @@ def fit(self): if not is_last_step: batch_data_future = self._async_gen_next_batch(continuous_iterator) - batch.non_tensor_batch["uid"] = np.array( - [str(uuid.uuid4()) for _ in range(len(batch.batch))], dtype=object - ) - # repeat to align with repeated responses in rollout - batch = batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True) - batch = batch.union(gen_batch_output) - - if "response_mask" not in batch.batch.keys(): - batch.batch["response_mask"] = compute_response_mask(batch) - # Balance the number of valid tokens across DP ranks. 
- # NOTE: This usually changes the order of data in the `batch`, - # which won't affect the advantage calculation (since it's based on uid), - # but might affect the loss calculation (due to the change of mini-batching). - # TODO: Decouple the DP balancing and mini-batching. - if self.config.trainer.balance_batch: - self._balance_batch(batch, metrics=metrics) - - # compute global_valid tokens - batch.meta_info["global_token_num"] = torch.sum(batch.batch["attention_mask"], dim=-1).tolist() - - with marked_timer("reward", timing_raw, color="yellow"): - # compute reward model score - if self.use_rm: - reward_tensor = self.rm_wg.compute_rm_score(batch) - batch = batch.union(reward_tensor) - - if self.config.reward_model.launch_reward_fn_async: - future_reward = compute_reward_async.remote(batch, self.config, self.tokenizer) - else: - reward_tensor, reward_extra_infos_dict = compute_reward(batch, self.reward_fn) - - # recompute old_log_probs - with marked_timer("old_log_prob", timing_raw, color="blue"): - old_log_prob = self.actor_wg.compute_log_prob(batch) - entropys = old_log_prob.batch["entropys"] - response_masks = batch.batch["response_mask"] - loss_agg_mode = self.config.actor_rollout_ref.actor.loss_agg_mode - entropy_agg = agg_loss(loss_mat=entropys, loss_mask=response_masks, loss_agg_mode=loss_agg_mode) - old_log_prob_metrics = {"actor/entropy": entropy_agg.detach().item()} - metrics.update(old_log_prob_metrics) - old_log_prob.batch.pop("entropys") - batch = batch.union(old_log_prob) - - if "rollout_log_probs" in batch.batch.keys(): - # TODO: we may want to add diff of probs too. 
- rollout_old_log_probs = batch.batch["rollout_log_probs"] - actor_old_log_probs = batch.batch["old_log_probs"] - attention_mask = batch.batch["attention_mask"] - responses = batch.batch["responses"] - response_length = responses.size(1) - response_mask = attention_mask[:, -response_length:] - - rollout_probs = torch.exp(rollout_old_log_probs) - actor_probs = torch.exp(actor_old_log_probs) - rollout_probs_diff = torch.abs(rollout_probs - actor_probs) - rollout_probs_diff = torch.masked_select(rollout_probs_diff, response_mask.bool()) - rollout_probs_diff_max = torch.max(rollout_probs_diff) - rollout_probs_diff_mean = torch.mean(rollout_probs_diff) - rollout_probs_diff_std = torch.std(rollout_probs_diff) - metrics.update( - { - "training/rollout_probs_diff_max": rollout_probs_diff_max.detach().item(), - "training/rollout_probs_diff_mean": rollout_probs_diff_mean.detach().item(), - "training/rollout_probs_diff_std": rollout_probs_diff_std.detach().item(), - } - ) - - if self.use_reference_policy: - # compute reference log_prob - with marked_timer("ref", timing_raw, color="olive"): - if not self.ref_in_actor: - ref_log_prob = self.ref_policy_wg.compute_ref_log_prob(batch) - else: - ref_log_prob = self.actor_wg.compute_ref_log_prob(batch) - batch = batch.union(ref_log_prob) - - # compute values - if self.use_critic: - with marked_timer("values", timing_raw, color="cyan"): - values = self.critic_wg.compute_values(batch) - batch = batch.union(values) - - with marked_timer("adv", timing_raw, color="brown"): - # we combine with rule-based rm - reward_extra_infos_dict: dict[str, list] - if self.config.reward_model.launch_reward_fn_async: - reward_tensor, reward_extra_infos_dict = ray.get(future_reward) - batch.batch["token_level_scores"] = reward_tensor - - if reward_extra_infos_dict: - batch.non_tensor_batch.update({k: np.array(v) for k, v in reward_extra_infos_dict.items()}) - - # compute rewards. 
apply_kl_penalty if available - if self.config.algorithm.use_kl_in_reward: - batch, kl_metrics = apply_kl_penalty( - batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.config.algorithm.kl_penalty - ) - metrics.update(kl_metrics) - else: - batch.batch["token_level_rewards"] = batch.batch["token_level_scores"] - - # compute advantages, executed on the driver process - - norm_adv_by_std_in_grpo = self.config.algorithm.get( - "norm_adv_by_std_in_grpo", True - ) # GRPO adv normalization factor - - batch = compute_advantage( - batch, - adv_estimator=self.config.algorithm.adv_estimator, - gamma=self.config.algorithm.gamma, - lam=self.config.algorithm.lam, - num_repeat=self.config.actor_rollout_ref.rollout.n, - norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo, - config=self.config.algorithm, - ) - - # update critic - if self.use_critic: - with marked_timer("update_critic", timing_raw, color="pink"): - critic_output = self.critic_wg.update_critic(batch) - critic_output_metrics = reduce_metrics(critic_output.meta_info["metrics"]) - metrics.update(critic_output_metrics) - - # implement critic warmup - if self.config.trainer.critic_warmup <= self.global_steps: - # update actor - with marked_timer("update_actor", timing_raw, color="red"): - batch.meta_info["multi_turn"] = self.config.actor_rollout_ref.rollout.multi_turn.enable - actor_output = self.actor_wg.update_actor(batch) - actor_output_metrics = reduce_metrics(actor_output.meta_info["metrics"]) - metrics.update(actor_output_metrics) - - # Log rollout generations if enabled - rollout_data_dir = self.config.trainer.get("rollout_data_dir", None) - if rollout_data_dir: - with marked_timer("dump_rollout_generations", timing_raw, color="green"): - inputs = self.tokenizer.batch_decode(batch.batch["prompts"], skip_special_tokens=True) - outputs = self.tokenizer.batch_decode(batch.batch["responses"], skip_special_tokens=True) - scores = batch.batch["token_level_scores"].sum(-1).cpu().tolist() - if "request_id" in 
batch.non_tensor_batch: - reward_extra_infos_dict.setdefault( - "request_id", - batch.non_tensor_batch["request_id"].tolist(), - ) - self._dump_generations( - inputs=inputs, - outputs=outputs, - scores=scores, - reward_extra_infos_dict=reward_extra_infos_dict, - dump_path=rollout_data_dir, - ) - - # validate - if ( - self.val_reward_fn is not None - and self.config.trainer.test_freq > 0 - and (is_last_step or self.global_steps % self.config.trainer.test_freq == 0) - ): - with marked_timer("testing", timing_raw, color="green"): - val_metrics: dict = self._validate() - if is_last_step: - last_val_metrics = val_metrics - metrics.update(val_metrics) - - # Check if the ESI (Elastic Server Instance)/training plan is close to expiration. - esi_close_to_expiration = should_save_ckpt_esi( - max_steps_duration=self.max_steps_duration, - redundant_time=self.config.trainer.esi_redundant_time, - ) - # Check if the conditions for saving a checkpoint are met. - # The conditions include a mandatory condition (1) and - # one of the following optional conditions (2/3/4): - # 1. The save frequency is set to a positive value. - # 2. It's the last training step. - # 3. The current step number is a multiple of the save frequency. - # 4. The ESI(Elastic Server Instance)/training plan is close to expiration. 
q - if self.config.trainer.save_freq > 0 and ( - is_last_step - or self.global_steps % self.config.trainer.save_freq == 0 - or esi_close_to_expiration - ): - if esi_close_to_expiration: - print("Force saving checkpoint: ESI instance expiration approaching.") - with marked_timer("save_checkpoint", timing_raw, color="green"): - self._save_checkpoint() - - with marked_timer("stop_profile", timing_raw): - self._stop_profiling(do_profile) - - steps_duration = timing_raw["step"] - self.max_steps_duration = max(self.max_steps_duration, steps_duration) - - # training metrics - metrics.update( - { - "training/global_step": self.global_steps, - "training/epoch": epoch, - } - ) - # collect metrics - metrics.update(compute_data_metrics(batch=batch, use_critic=self.use_critic)) - metrics.update(compute_timing_metrics(batch=batch, timing_raw=timing_raw)) - # TODO: implement actual tflpo and theoretical tflpo - n_gpus = self.resource_pool_manager.get_n_gpus() - metrics.update(compute_throughout_metrics(batch=batch, timing_raw=timing_raw, n_gpus=n_gpus)) + batch = self._post_generate_batch(batch, gen_batch_output, metrics) + batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw) + self._log_rollout(batch, reward_extra_infos_dict, timing_raw) + last_val_metrics = self._validate_metrics(is_last_step, last_val_metrics, metrics, timing_raw) + self._check_save_checkpoint(is_last_step, timing_raw) - # this is experimental and may be changed/removed in the future in favor of a general-purpose one - if isinstance(self.train_dataloader.sampler, AbstractCurriculumSampler): - self.train_dataloader.sampler.update(batch=batch) + self._stop_profiling(do_profile, timing_raw) + self._collect_metrics(batch, epoch, metrics, timing_raw) + self._post_batch_processing(batch) # TODO: make a canonical logger that supports various backend logger.log(data=metrics, step=self.global_steps) @@ -585,12 +334,6 @@ def fit(self): progress_bar.close() return - # this is 
experimental and may be changed/removed in the future - # in favor of a general-purpose data buffer pool - if hasattr(self.train_dataset, "on_batch_end"): - # The dataset may be changed after each training batch - self.train_dataset.on_batch_end(batch=batch) - def get_statistics(self) -> dict: """获取训练统计信息""" queue_stats = self.message_queue_client.get_statistics() if self.message_queue_client else {} @@ -605,3 +348,18 @@ def get_statistics(self) -> dict: "queue_total_consumed": queue_stats.get("total_consumed", 0), "queue_dropped_samples": queue_stats.get("dropped_samples", 0), } + + def _compute_sample_freshness_metrics(self, batch_samples: list[QueueSample]) -> dict: + """计算样本新鲜度指标""" + sample_ages = [self.current_param_version - sample.param_version for sample in batch_samples] + current_time = time.time() + sample_latencies = [current_time - sample.timestamp for sample in batch_samples] + + return { + "freshness/avg_sample_age": np.mean(sample_ages), + "freshness/max_sample_age": max(sample_ages), + "freshness/min_sample_age": min(sample_ages), + "freshness/avg_sample_latency": np.mean(sample_latencies), + "freshness/max_sample_latency": max(sample_latencies), + "freshness/stale_samples_ratio": sum(1 for age in sample_ages if age > 1) / len(sample_ages), + } diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py index 5866dcfd4a9..8e686e9a471 100644 --- a/recipe/fully_async_policy/message_queue.py +++ b/recipe/fully_async_policy/message_queue.py @@ -76,8 +76,9 @@ def __init__(self, config: DictConfig, max_queue_size: int = 1000): "staleness_threshold={self.staleness_threshold}" ) - def put_samples(self, epoch: int, samples: List[Any], param_version: int, - rollout_metadata_list: List[dict[str, Any]] = None) -> bool: + def put_samples( + self, epoch: int, samples: List[Any], param_version: int, rollout_metadata_list: List[dict[str, Any]] = None + ) -> bool: """ 放入一个batch样本到队列 @@ -103,8 +104,7 @@ def put_samples(self, 
epoch: int, samples: List[Any], param_version: int, rollout_metadata_list = [{}] * len(samples) if len(rollout_metadata_list) != len(samples): - logger.warning( - f"len(rollout_metadata_list):{len(rollout_metadata_list)} != len(samples:{len(samples)}") + logger.warning(f"len(rollout_metadata_list):{len(rollout_metadata_list)} != len(samples:{len(samples)}") return False for sample, meta in zip(samples, rollout_metadata_list): @@ -237,7 +237,9 @@ class MessageQueueClient: def __init__(self, queue_actor: Any): self.queue_actor = queue_actor - def put_batch(self, epoch: int, batch: List[Any], param_version: int, rollout_metadata_list: List[dict[str, Any]] = None) -> bool: + def put_batch( + self, epoch: int, batch: List[Any], param_version: int, rollout_metadata_list: List[dict[str, Any]] = None + ) -> bool: """放入batch到队列""" return ray.get(self.queue_actor.put_samples.remote(epoch, batch, param_version, rollout_metadata_list)) diff --git a/recipe/fully_async_policy/unittest/test_mq.py b/recipe/fully_async_policy/unittest/test_mq.py index dbc29c3e9ce..36172d02640 100644 --- a/recipe/fully_async_policy/unittest/test_mq.py +++ b/recipe/fully_async_policy/unittest/test_mq.py @@ -67,10 +67,7 @@ def test_put_samples_success(self, message_queue_client, mock_data_proto): metadata_list = [{"test": "data1"}, {"test": "data2"}] result = message_queue_client.put_batch( - epoch=1, - batch=samples, - param_version=1, - rollout_metadata_list=metadata_list + epoch=1, batch=samples, param_version=1, rollout_metadata_list=metadata_list ) assert result is True @@ -88,12 +85,7 @@ def test_put_samples_without_metadata(self, message_queue_client, mock_data_prot """测试不提供metadata时的处理""" samples = [mock_data_proto, mock_data_proto] - result = message_queue_client.put_batch( - epoch=1, - batch=samples, - param_version=1, - rollout_metadata_list=None - ) + result = message_queue_client.put_batch(epoch=1, batch=samples, param_version=1, rollout_metadata_list=None) assert result is True 
queue_size = message_queue_client.get_queue_size() @@ -105,10 +97,7 @@ def test_put_samples_metadata_mismatch(self, message_queue_client, mock_data_pro metadata_list = [{"test": "data1"}] # 长度不匹配 result = message_queue_client.put_batch( - epoch=1, - batch=samples, - param_version=1, - rollout_metadata_list=metadata_list + epoch=1, batch=samples, param_version=1, rollout_metadata_list=metadata_list ) assert result is False # 应该失败 @@ -126,7 +115,7 @@ def test_put_samples_staleness_check(self, message_queue_client, mock_data_proto epoch=1, batch=samples, param_version=2, # 5-2=3, 达到阈值 - rollout_metadata_list=None + rollout_metadata_list=None, ) assert result is False @@ -140,12 +129,7 @@ def test_put_samples_queue_overflow(self, message_queue_client, mock_data_proto) # 填满队列(最大容量10) for i in range(6): # 每次放入2个,总共12个,超过最大容量10 samples = [mock_data_proto, mock_data_proto] - message_queue_client.put_batch( - epoch=1, - batch=samples, - param_version=1, - rollout_metadata_list=None - ) + message_queue_client.put_batch(epoch=1, batch=samples, param_version=1, rollout_metadata_list=None) # 队列大小应该保持在最大值 queue_size = message_queue_client.get_queue_size() @@ -160,12 +144,7 @@ def test_get_samples_success(self, message_queue_client, mock_data_proto): # 先放入一些samples samples = [mock_data_proto, mock_data_proto, mock_data_proto] metadata_list = [{"index": 0}, {"index": 1}, {"index": 2}] - message_queue_client.put_batch( - epoch=1, - batch=samples, - param_version=1, - rollout_metadata_list=metadata_list - ) + message_queue_client.put_batch(epoch=1, batch=samples, param_version=1, rollout_metadata_list=metadata_list) # 获取2个samples retrieved_samples = message_queue_client.get_batch(min_batch_count=2) @@ -194,12 +173,7 @@ def get_samples(): def put_samples_later(): time.sleep(0.5) # 延迟放入 samples = [mock_data_proto, mock_data_proto] - message_queue_client.put_batch( - epoch=1, - batch=samples, - param_version=1, - rollout_metadata_list=None - ) + message_queue_client.put_batch(epoch=1, 
batch=samples, param_version=1, rollout_metadata_list=None) # 启动消费者线程 consumer_thread = threading.Thread(target=get_samples) @@ -225,12 +199,7 @@ def test_clear_queue(self, message_queue_client, mock_data_proto): """测试清空队列""" # 先添加一些样本 samples = [mock_data_proto, mock_data_proto, mock_data_proto] - message_queue_client.put_batch( - epoch=1, - batch=samples, - param_version=1, - rollout_metadata_list=None - ) + message_queue_client.put_batch(epoch=1, batch=samples, param_version=1, rollout_metadata_list=None) # 清空队列 message_queue_client.clear_queue() @@ -244,12 +213,7 @@ def test_get_queue_size(self, message_queue_client, mock_data_proto): assert message_queue_client.get_queue_size() == 0 samples = [mock_data_proto] - message_queue_client.put_batch( - epoch=1, - batch=samples, - param_version=1, - rollout_metadata_list=None - ) + message_queue_client.put_batch(epoch=1, batch=samples, param_version=1, rollout_metadata_list=None) assert message_queue_client.get_queue_size() == 1 def test_get_statistics(self, message_queue_client): @@ -274,12 +238,7 @@ def test_get_memory_usage(self, message_queue_client, mock_data_proto): """测试获取内存使用统计""" # 添加一些样本 samples = [mock_data_proto, mock_data_proto] - message_queue_client.put_batch( - epoch=1, - batch=samples, - param_version=1, - rollout_metadata_list=None - ) + message_queue_client.put_batch(epoch=1, batch=samples, param_version=1, rollout_metadata_list=None) memory_stats = message_queue_client.get_memory_usage() @@ -328,12 +287,7 @@ def test_concurrent_put_get(self, mock_data_proto): def producer(): for i in range(50): samples = [mock_data_proto, mock_data_proto] - result = client.put_batch( - epoch=i, - batch=samples, - param_version=1, - rollout_metadata_list=None - ) + result = client.put_batch(epoch=i, batch=samples, param_version=1, rollout_metadata_list=None) results.append(("put", result)) time.sleep(0.1) diff --git a/recipe/one_step_off_policy/ray_trainer.py b/recipe/one_step_off_policy/ray_trainer.py index 
c1687561d01..c5b7a71225e 100644 --- a/recipe/one_step_off_policy/ray_trainer.py +++ b/recipe/one_step_off_policy/ray_trainer.py @@ -27,7 +27,6 @@ from tqdm import tqdm from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup -from verl.single_controller.ray.base import create_colocated_worker_cls from verl.trainer.ppo import core_algos from verl.trainer.ppo.core_algos import AdvantageEstimator from verl.trainer.ppo.ray_trainer import ( @@ -163,94 +162,35 @@ def _validate(self): self.actor_rollout_wg = self.actor_wg return ret - def init_workers(self): - """Initialize distributed training workers using Ray backend. - - Creates: - 1. Ray resource pools from configuration - 2. Worker groups for each role (actor, critic, etc.) - """ - self.resource_pool_manager.create_resource_pool() - - self.resource_pool_to_cls = {pool: {} for pool in self.resource_pool_manager.resource_pool_dict.values()} - + def _create_actor_rollout_classes(self): # create actor and rollout - for role, role_name in [(Role.Actor, "actor"), (Role.Rollout, "rollout")]: - resource_pool = self.resource_pool_manager.get_resource_pool(role) - role_cls = RayClassWithInitArgs( - cls=self.role_worker_mapping[role], - config=self.config.actor_rollout_ref, - role=role_name, - ) - self.resource_pool_to_cls[resource_pool][role_name] = role_cls - - # create critic - if self.use_critic: - resource_pool = self.resource_pool_manager.get_resource_pool(Role.Critic) - critic_cls = RayClassWithInitArgs(cls=self.role_worker_mapping[Role.Critic], config=self.config.critic) - self.resource_pool_to_cls[resource_pool]["critic"] = critic_cls - - # create reference policy if needed - if self.use_reference_policy: - resource_pool = self.resource_pool_manager.get_resource_pool(Role.RefPolicy) - ref_policy_cls = RayClassWithInitArgs( - self.role_worker_mapping[Role.RefPolicy], - config=self.config.actor_rollout_ref, - role="ref", - profile_option=self.config.trainer.npu_profile.options, - ) - 
self.resource_pool_to_cls[resource_pool]["ref"] = ref_policy_cls - - # create a reward model if reward_fn is None - if self.use_rm: - # we create a RM here - resource_pool = self.resource_pool_manager.get_resource_pool(Role.RewardModel) - rm_cls = RayClassWithInitArgs(self.role_worker_mapping[Role.RewardModel], config=self.config.reward_model) - self.resource_pool_to_cls[resource_pool]["rm"] = rm_cls - - # initialize WorkerGroup - # NOTE: if you want to use a different resource pool for each role, which can support different parallel size, - # you should not use `create_colocated_worker_cls`. - # Instead, directly pass different resource pool to different worker groups. - # See https://github.com/volcengine/verl/blob/master/examples/ray/tutorial.ipynb for more information. - all_wg = {} - wg_kwargs = {} # Setting up kwargs for RayWorkerGroup - if OmegaConf.select(self.config.trainer, "ray_wait_register_center_timeout") is not None: - wg_kwargs["ray_wait_register_center_timeout"] = self.config.trainer.ray_wait_register_center_timeout - if OmegaConf.select(self.config.trainer, "profile_steps") is not None: - wg_kwargs["profile_steps"] = OmegaConf.select(self.config.trainer, "profile_steps") - assert OmegaConf.select(self.config.trainer, "worker_nsight_options") is not None, ( - "worker_nsight_options must be set when profile_steps is set" - ) - wg_kwargs["worker_nsight_options"] = OmegaConf.to_container( - OmegaConf.select(self.config.trainer, "worker_nsight_options") - ) - - for resource_pool, class_dict in self.resource_pool_to_cls.items(): - worker_dict_cls = create_colocated_worker_cls(class_dict=class_dict) - wg_dict = self.ray_worker_group_cls( - resource_pool=resource_pool, - ray_cls_with_init=worker_dict_cls, - device_name=self.device_name, - **wg_kwargs, - ) - spawn_wg = wg_dict.spawn(prefix_set=class_dict.keys()) - all_wg.update(spawn_wg) + if not self.hybrid_engine: + for role in [Role.Actor, Role.Rollout]: + resource_pool = 
self.resource_pool_manager.get_resource_pool(role) + role_cls = RayClassWithInitArgs( + cls=self.role_worker_mapping[role], + config=self.config.actor_rollout_ref, + role=str(role), + ) + self.resource_pool_to_cls[resource_pool][str(role)] = role_cls + else: + raise NotImplementedError + def _init_models(self): if self.use_critic: - self.critic_wg = all_wg["critic"] + self.critic_wg = self.all_wg[str(Role.Critic)] self.critic_wg.init_model() if self.use_reference_policy and not self.ref_in_actor: - self.ref_policy_wg = all_wg["ref"] + self.ref_policy_wg = self.all_wg[str(Role.RefPolicy)] self.ref_policy_wg.init_model() if self.use_rm: - self.rm_wg = all_wg["rm"] + self.rm_wg = self.all_wg[str(Role.RewardModel)] self.rm_wg.init_model() - self.actor_wg = all_wg["actor"] - self.rollout_wg = all_wg["rollout"] + self.actor_wg = self.all_wg[str(Role.Actor)] + self.rollout_wg = self.all_wg[str(Role.Rollout)] self.actor_wg.init_model() self.rollout_wg.init_model() self.actor_rollout_wg = self.actor_wg # to be compatible with the functions that not be modified @@ -268,21 +208,9 @@ def init_workers(self): ) self.sync_rollout_weights() - # create async rollout manager and request scheduler - self.async_rollout_mode = False - if self.config.actor_rollout_ref.rollout.mode == "async": - from verl.workers.rollout.async_server import AsyncLLMServerManager - - self.async_rollout_mode = True - self.async_rollout_manager = AsyncLLMServerManager( - config=self.config, - worker_group=self.rollout_wg, - ) - def sync_rollout_weights(self): - if not self.hybrid_engine: - self.actor_wg.sync_rollout_weights() - ray.get(self.rollout_wg.sync_rollout_weights()) + self.actor_wg.sync_rollout_weights() + ray.get(self.rollout_wg.sync_rollout_weights()) def _create_continuous_iterator(self): """ @@ -318,7 +246,6 @@ def fit(self): to construct the PPO dataflow. The light-weight advantage computation is done on the driver process. 
""" - from omegaconf import OmegaConf from verl.utils.tracking import Tracking diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py index 49334db6bcd..26150cc631d 100644 --- a/verl/trainer/ppo/ray_trainer.py +++ b/verl/trainer/ppo/ray_trainer.py @@ -79,6 +79,40 @@ class Role(Enum): RewardModel = 5 ActorRolloutRef = 6 + def __str__(self): + """返回与代码中一致的字符串表示""" + return self._get_role_string() + + def _get_role_string(self): + """获取角色对应的字符串名称""" + role_mapping = { + Role.Actor: "actor", + Role.Rollout: "rollout", + Role.ActorRollout: "actor_rollout", + Role.Critic: "critic", + Role.RefPolicy: "ref", + Role.RewardModel: "rm", + Role.ActorRolloutRef: "actor_rollout_ref", + } + return role_mapping.get(self, self.name.lower()) + + @classmethod + def from_string(cls, name: str): + """从字符串创建Role实例""" + string_mapping = { + "actor": cls.Actor, + "rollout": cls.Rollout, + "actor_rollout": cls.ActorRollout, + "critic": cls.Critic, + "ref": cls.RefPolicy, + "rm": cls.RewardModel, + "actor_rollout_ref": cls.ActorRolloutRef, + } + role = string_mapping.get(name.lower()) + if role is None: + raise ValueError(f"No Role found for string: {name}") + return role + @dataclass class ResourcePoolManager: @@ -776,48 +810,65 @@ def init_workers(self): 1. Ray resource pools from configuration 2. Worker groups for each role (actor, critic, etc.) 
""" - self.resource_pool_manager.create_resource_pool() + self._init_resource_pools() + self._create_worker_classes() + self._init_worker_groups() + self._init_models() + self._init_async_rollout_manager() + def _init_resource_pools(self): + self.resource_pool_manager.create_resource_pool() self.resource_pool_to_cls = {pool: {} for pool in self.resource_pool_manager.resource_pool_dict.values()} + def _create_worker_classes(self): + self._create_actor_rollout_classes() + self._create_critic_class() + self._create_reference_policy_class() + self._create_reward_model_class() + + def _create_actor_rollout_classes(self): # create actor and rollout if self.hybrid_engine: resource_pool = self.resource_pool_manager.get_resource_pool(Role.ActorRollout) actor_rollout_cls = RayClassWithInitArgs( cls=self.role_worker_mapping[Role.ActorRollout], config=self.config.actor_rollout_ref, - role="actor_rollout", + role=str(Role.ActorRollout), profile_option=self.config.trainer.npu_profile.options, ) - self.resource_pool_to_cls[resource_pool]["actor_rollout"] = actor_rollout_cls + self.resource_pool_to_cls[resource_pool][str(Role.ActorRollout)] = actor_rollout_cls else: raise NotImplementedError + def _create_critic_class(self): # create critic if self.use_critic: resource_pool = self.resource_pool_manager.get_resource_pool(Role.Critic) critic_cfg = omega_conf_to_dataclass(self.config.critic) critic_cls = RayClassWithInitArgs(cls=self.role_worker_mapping[Role.Critic], config=critic_cfg) - self.resource_pool_to_cls[resource_pool]["critic"] = critic_cls + self.resource_pool_to_cls[resource_pool][str(Role.Critic)] = critic_cls + def _create_reference_policy_class(self): # create reference policy if needed if self.use_reference_policy: resource_pool = self.resource_pool_manager.get_resource_pool(Role.RefPolicy) ref_policy_cls = RayClassWithInitArgs( self.role_worker_mapping[Role.RefPolicy], config=self.config.actor_rollout_ref, - role="ref", + role=str(Role.RefPolicy), 
profile_option=self.config.trainer.npu_profile.options, ) - self.resource_pool_to_cls[resource_pool]["ref"] = ref_policy_cls + self.resource_pool_to_cls[resource_pool][str(Role.RefPolicy)] = ref_policy_cls + def _create_reward_model_class(self): # create a reward model if reward_fn is None if self.use_rm: # we create a RM here resource_pool = self.resource_pool_manager.get_resource_pool(Role.RewardModel) rm_cls = RayClassWithInitArgs(self.role_worker_mapping[Role.RewardModel], config=self.config.reward_model) - self.resource_pool_to_cls[resource_pool]["rm"] = rm_cls + self.resource_pool_to_cls[resource_pool][str(Role.RewardModel)] = rm_cls + def _init_worker_groups(self): # initialize WorkerGroup # NOTE: if you want to use a different resource pool for each role, which can support different parallel size, # you should not use `create_colocated_worker_cls`. @@ -846,23 +897,26 @@ def init_workers(self): ) spawn_wg = wg_dict.spawn(prefix_set=class_dict.keys()) all_wg.update(spawn_wg) + self.all_wg = all_wg + def _init_models(self): if self.use_critic: - self.critic_wg = all_wg["critic"] + self.critic_wg = self.all_wg[str(Role.Critic)] self.critic_wg.init_model() if self.use_reference_policy and not self.ref_in_actor: - self.ref_policy_wg = all_wg["ref"] + self.ref_policy_wg = self.all_wg[str(Role.RefPolicy)] self.ref_policy_wg.init_model() if self.use_rm: - self.rm_wg = all_wg["rm"] + self.rm_wg = self.all_wg[str(Role.RewardModel)] self.rm_wg.init_model() # we should create rollout at the end so that vllm can have a better estimation of kv cache memory - self.actor_rollout_wg = all_wg["actor_rollout"] + self.actor_rollout_wg = self.all_wg[Role.ActorRollout] self.actor_rollout_wg.init_model() + def _init_async_rollout_manager(self): # create async rollout manager and request scheduler self.async_rollout_mode = False if self.config.actor_rollout_ref.rollout.mode == "async": From 9e8b596271574776088dc5e2e8778fbf955d62c0 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Fri, 
1 Aug 2025 15:34:42 +0800 Subject: [PATCH 016/182] init worker --- .../fully_async_rollouter.py | 386 +++++++----------- .../fully_async_policy/fully_async_trainer.py | 23 +- recipe/fully_async_policy/message_queue.py | 8 +- recipe/fully_async_policy/unittest/test_mq.py | 4 +- recipe/one_step_off_policy/ray_trainer.py | 47 +-- 5 files changed, 182 insertions(+), 286 deletions(-) diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 6b41d635013..f1248441594 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -11,25 +11,21 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import logging import threading import time -import uuid from concurrent.futures import ThreadPoolExecutor -from typing import Optional -import numpy as np import ray from omegaconf import OmegaConf from torch.utils.data import Dataset, Sampler +from tqdm import tqdm from recipe.fully_async_policy.message_queue import MessageQueueClient -from verl import DataProto from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup -from verl.single_controller.ray.base import create_colocated_worker_cls -from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role, WorkerType, RayPPOTrainer +from verl.trainer.ppo.ray_trainer import RayPPOTrainer, ResourcePoolManager, Role, WorkerType from verl.utils.debug import marked_timer +from verl.utils.tracking import ValidationGenerationsLogger logger = logging.getLogger(__name__) @@ -46,7 +42,7 @@ def __init__(self): self.lock = threading.RLock() self.pause_count = 0 - def pause(self, timeout: Optional[float] = None) -> bool: + def pause(self, timeout: float | None = None) -> bool: """ 暂停rollout @@ -115,7 +111,7 @@ def get_status(self) -> dict: } -@ray.remote 
+@ray.remote(num_cpus=10, max_concurrency=10) class FullyAsyncRollouter(RayPPOTrainer): """ 异步样本生成器,负责持续生成训练样本并放入MessageQueue @@ -123,20 +119,20 @@ class FullyAsyncRollouter(RayPPOTrainer): """ def __init__( - self, - config, - tokenizer, - role_worker_mapping: dict[Role, WorkerType], - resource_pool_manager: ResourcePoolManager, - ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, - processor=None, - reward_fn=None, - val_reward_fn=None, - train_dataset: Optional[Dataset] = None, - val_dataset: Optional[Dataset] = None, - collate_fn=None, - train_sampler: Optional[Sampler] = None, - device_name=None, + self, + config, + tokenizer, + role_worker_mapping: dict[Role, WorkerType], + resource_pool_manager: ResourcePoolManager, + ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, + processor=None, + reward_fn=None, + val_reward_fn=None, + train_dataset: Dataset | None = None, + val_dataset: Dataset | None = None, + collate_fn=None, + train_sampler: Sampler | None = None, + device_name=None, ): """ Initialize distributed PPO trainer with Ray backend. 
@@ -170,8 +166,6 @@ def __init__( self.role_worker_mapping = role_worker_mapping self.resource_pool_manager = resource_pool_manager - self.use_reference_policy = Role.RefPolicy in role_worker_mapping - self.use_rm = Role.RewardModel in role_worker_mapping self.ray_worker_group_cls = ray_worker_group_cls self.device_name = device_name if device_name else self.config.trainer.device self.validation_generations_logger = ValidationGenerationsLogger( @@ -179,25 +173,11 @@ def __init__( experiment_name=self.config.trainer.experiment_name, ) - # if ref_in_actor is True, the reference policy will be actor without lora applied - self.ref_in_actor = config.actor_rollout_ref.model.get("lora_rank", 0) > 0 - - # define in-reward KL control - # kl loss control currently not suppoorted - if self.config.algorithm.use_kl_in_reward: - self.kl_ctrl_in_reward = core_algos.get_kl_controller(self.config.algorithm.kl_ctrl) - - if config.critic.enable is not None: - self.use_critic = bool(config.critic.enable) - elif self.config.algorithm.adv_estimator == AdvantageEstimator.GAE: - self.use_critic = True - else: - warnings.warn( - "Disabled critic as algorithm.adv_estimator != gae. 
" - "If it is not intended, please set critic.enable=True", - stacklevel=2, - ) - self.use_critic = False + self.ref_in_actor = False + self.kl_ctrl_in_reward = False + self.use_critic = False + self.use_reference_policy = False + self.use_rm = False self._validate_config() self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler) @@ -212,7 +192,6 @@ def __init__( self.staleness_threshold = async_config.get("staleness_threshold", 3) self.max_staleness_allowed = async_config.get("max_staleness_allowed", 5) self.generation_timeout = async_config.get("generation_timeout", 30.0) - self.batch_generation_interval = async_config.get("batch_generation_interval", 0.1) # 统计信息 self.total_generated_samples = 0 @@ -239,91 +218,50 @@ def set_message_queue_client(self, message_queue_client: MessageQueueClient): """设置消息队列客户端""" self.message_queue_client = message_queue_client - def _validate(self): - """执行验证 - 参考OneStepOffRayTrainer的验证逻辑""" - return None + def set_parameter_synchronizer(self, param_synchronizer): + """设置参数同步器""" + self.param_synchronizer = param_synchronizer def _validate_config(self): - """验证配置""" - required_configs = [ - "data.train_batch_size", - "actor_rollout_ref.rollout.n", - "async_training.staleness_threshold", - ] - - for config_path in required_configs: - if not OmegaConf.select(self.config, config_path): - logger.warning(f"Missing recommended config: {config_path}") - # 验证异步训练配置 if not hasattr(self.config, "async_training"): raise ValueError("Missing async_training configuration") def init_workers(self): - """初始化rollout workers - 参考OneStepOffRayTrainer的实现""" + """初始化rollout workers""" logger.info("Initializing Rollouter workers...") - - self.resource_pool_manager.create_resource_pool() - self.resource_pool_to_cls = {pool: {} for pool in self.resource_pool_manager.resource_pool_dict.values()} - - # 只创建rollout worker - resource_pool = self.resource_pool_manager.get_resource_pool(Role.Rollout) - role_cls = RayClassWithInitArgs( - 
cls=self.role_worker_mapping[Role.Rollout], - config=self.config.actor_rollout_ref, - role="rollout", - ) - self.resource_pool_to_cls[resource_pool]["rollout"] = role_cls - - # 初始化WorkerGroup - all_wg = {} - wg_kwargs = {} - if OmegaConf.select(self.config.trainer, "ray_wait_register_center_timeout") is not None: - wg_kwargs["ray_wait_register_center_timeout"] = self.config.trainer.ray_wait_register_center_timeout - if OmegaConf.select(self.config.trainer, "profile_steps") is not None: - wg_kwargs["profile_steps"] = OmegaConf.select(self.config.trainer, "profile_steps") - if OmegaConf.select(self.config.trainer, "worker_nsight_options") is not None: - wg_kwargs["worker_nsight_options"] = OmegaConf.to_container( - OmegaConf.select(self.config.trainer, "worker_nsight_options") - ) - - for resource_pool, class_dict in self.resource_pool_to_cls.items(): - worker_dict_cls = create_colocated_worker_cls(class_dict=class_dict) - wg_dict = self.ray_worker_group_cls( - resource_pool=resource_pool, - ray_cls_with_init=worker_dict_cls, - device_name=self.device_name, - **wg_kwargs, - ) - spawn_wg = wg_dict.spawn(prefix_set=class_dict.keys()) - all_wg.update(spawn_wg) + self._init_resource_pools() self.rollout_wg = all_wg["rollout"] self.rollout_wg.init_model() - # 初始化异步rollout管理器(如果需要) - if self.async_rollout_mode: - self._init_async_rollout_manager() + def _create_actor_rollout_classes(self): + # only create rollout + for role in [Role.Rollout]: + resource_pool = self.resource_pool_manager.get_resource_pool(role) + role_cls = RayClassWithInitArgs( + cls=self.role_worker_mapping[role], + config=self.config.actor_rollout_ref, + role=str(role), + ) + self.resource_pool_to_cls[resource_pool][str(role)] = role_cls - logger.info("Rollouter workers initialized successfully") + def _init_models(self): + self.rollout_wg = self.all_wg[str(Role.Rollout)] + self.rollout_wg.init_model() + self.actor_rollout_wg = self.rollout_wg def _init_async_rollout_manager(self): - 
"""初始化异步rollout管理器""" - try: - from verl.workers.rollout.async_server import AsyncLLMServerManager + # create async rollout manager and request scheduler + self.async_rollout_mode = False + if self.config.actor_rollout_ref.rollout.mode == "async": + from verl.experimental.agent_loop import AgentLoopManager - self.async_rollout_manager = AsyncLLMServerManager( + self.async_rollout_mode = True + self.async_rollout_manager = AgentLoopManager( config=self.config, - worker_group=self.rollout_wg, + worker_group=self.actor_rollout_wg, ) - logger.info("Async rollout manager initialized") - except Exception as e: - logger.warning(f"Failed to initialize async rollout manager: {e}") - self.async_rollout_mode = False - - def set_parameter_synchronizer(self, param_synchronizer): - """设置参数同步器""" - self.param_synchronizer = param_synchronizer def update_rollout_weights(self, param_version: int) -> bool: """ @@ -468,143 +406,117 @@ def _should_pause_generation(self) -> bool: logger.error(f"Error checking pause conditions: {e}") return True # 出错时暂停生成 - def _generate_batch(self, epoch: int, batch_dict: dict) -> Optional[DataProto]: - """生成单个batch的样本 - 改进的生成逻辑""" - try: - batch = DataProto.from_single_dict(batch_dict) - - # 处理batch用于生成 - 参考OneStepOffRayTrainer的处理逻辑 - batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"] - non_tensor_batch_keys_to_pop = ["raw_prompt_ids"] - - # 处理多模态数据和其他可选字段 - optional_keys = ["multi_modal_data", "raw_prompt", "tools_kwargs", "interaction_kwargs"] - for key in optional_keys: - if key in batch.non_tensor_batch: - non_tensor_batch_keys_to_pop.append(key) - - gen_batch = batch.pop( - batch_keys=batch_keys_to_pop, - non_tensor_batch_keys=non_tensor_batch_keys_to_pop, - ) - - # 重复生成多个响应 - 参考OneStepOffRayTrainer - n_repeats = self.config.actor_rollout_ref.rollout.n - gen_batch = gen_batch.repeat(repeat_times=n_repeats, interleave=True) - - # 执行生成 - if self.async_rollout_mode: - # 异步生成 - gen_batch_output = ray.get( - 
self.rollout_wg.async_generate_sequences.remote(gen_batch), timeout=self.generation_timeout - ) - else: - # 同步生成 - gen_batch_output = ray.get( - self.rollout_wg.generate_sequences.remote(gen_batch), timeout=self.generation_timeout - ) - - # 添加UID - 确保每个样本有唯一标识 - batch.non_tensor_batch["uid"] = np.array([str(uuid.uuid4()) for _ in range(len(batch.batch))], dtype=object) - - # 重复原始batch以对齐生成的响应 - batch = batch.repeat(repeat_times=n_repeats, interleave=True) - - # 合并数据 - final_batch = batch.union(gen_batch_output) - - # 添加rollout metadata - final_batch.meta_info["rollout_param_version"] = self.current_param_version - final_batch.meta_info["generation_timestamp"] = time.time() - - return final_batch - - except Exception as e: - logger.error(f"Error generating batch: {e}") - self.generation_errors += 1 - return None - - def _generation_loop(self): - """主要的生成循环 - 改进的循环逻辑""" - logger.info("Starting generation loop...") + def fit(self): + """开始异步生成样本 - 改进的主运行逻辑 + 主要的生成循环 - try: - continuous_iterator = self._create_continuous_iterator() + 循环入口,需要 + 1. running 判断 + 4. 中断判断 + 3. 
新鲜度判断 - for epoch, batch_dict in continuous_iterator: - if not self.running: - break - - # 等待如果被暂停 - if not self.rollout_controller.wait_if_paused(timeout=1.0): - if not self.running: - break - continue - - # 检查是否应该暂停生成 - if self._should_pause_generation(): - time.sleep(self.batch_generation_interval) - continue - - # 生成样本 - timing_raw = {} - with marked_timer("generate_batch", timing_raw): - generated_batch = self._generate_batch(epoch, batch_dict) - - if generated_batch is not None: - # 准备rollout metadata - rollout_metadata = { - "timing": timing_raw, - "generation_timestamp": time.time(), - "rollout_param_version": self.current_param_version, - "epoch": epoch, - } - - # 放入队列 - success = self.message_queue_client.put_samples( - epoch=epoch, - sample=generated_batch, - param_version=self.current_param_version, - rollout_metadata=rollout_metadata, - ) - - if success: - self.total_generated_samples += 1 - if self.total_generated_samples % 10 == 0: - logger.info( - f"Generated {self.total_generated_samples} batches, " - f"param_version={self.current_param_version}, " - f"errors={self.generation_errors}" - ) - else: - self.dropped_stale_samples += 1 - if self.dropped_stale_samples % 5 == 0: - logger.warning(f"Dropped stale samples: {self.dropped_stale_samples}") - - # 控制生成频率 - if self.batch_generation_interval > 0: - time.sleep(self.batch_generation_interval) + 生成样本过程中,需要 + 1. running 判断 + 2. 中断判断 + """ - except Exception as e: - logger.error(f"Generation loop error: {e}") - finally: - logger.info("Generation loop finished") + from verl.utils.tracking import Tracking - def fit(self): - """开始异步生成样本 - 改进的主运行逻辑""" + logger = Tracking( + project_name=self.config.trainer.project_name, + experiment_name=self.config.trainer.experiment_name, + default_backend=self.config.trainer.logger, + config=OmegaConf.to_container(self.config, resolve=True), + ) logger.info("Starting Rollouter...") - if self.message_queue_client is None: raise ValueError("MessageQueue client not set. 
Call set_message_queue_client() first.") - + if self.param_synchronizer is None: + raise ValueError("param_synchronizer client not set. Call set_parameter_synchronizer() first.") self.running = True # 在单独的线程中运行生成循环 - self.generation_thread = threading.Thread(target=self._generation_loop, daemon=True) - self.generation_thread.start() + self.report_thread = threading.Thread(target=self._report_loop, daemon=True) + self.report_thread.start() + + self.global_steps = 0 + + # load checkpoint before doing anything + self._load_checkpoint() + + # perform validation before training + # currently, we only support validation using the reward_function. + if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True): + val_metrics = self._validate() + assert val_metrics, f"{val_metrics=}" + pprint(f"Initial validation metrics: {val_metrics}") + logger.log(data=val_metrics, step=self.global_steps) + if self.config.trainer.get("val_only", False): + return + + # add tqdm + progress_bar = tqdm(total=self.total_training_steps, initial=self.global_steps, desc="Training Progress") + + # we start from step 1 + self.global_steps += 1 + last_val_metrics = None + self.max_steps_duration = 0 + + continuous_iterator = self._create_continuous_iterator() + for epoch, batch_dict in continuous_iterator: + if not self.running: + break + # 等待如果被暂停 + if not self.rollout_controller.wait_if_paused(timeout=1.0): + if not self.running: + break + + # 检查是否应该暂停生成 + self._should_pause_generation() + + metrics = {} + timing_raw = {} + batch, gen_batch = self._prepare_generate_batch(batch_dict) + is_last_step = self.global_steps >= self.total_training_steps - logger.info("Rollouter started successfully") + # generate a batch + with marked_timer("gen", timing_raw, color="red"): + if not self.async_rollout_mode: + gen_batch_output = self.actor_rollout_wg.generate_sequences(gen_batch) + else: + gen_batch_output = self.async_rollout_manager.generate_sequences(gen_batch) + 
timing_raw.update(gen_batch_output.meta_info["timing"]) + gen_batch_output.meta_info.pop("timing", None) + + if gen_batch_output is not None: + # 准备rollout metadata + rollout_metadata = { + "timing": timing_raw, + "generation_timestamp": time.time(), + "rollout_param_version": self.current_param_version, + "epoch": epoch, + } + # 放入队列 + success = self.message_queue_client.put_samples( + epoch=epoch, + sample=gen_batch_output, + param_version=self.current_param_version, + rollout_metadata=rollout_metadata, + ) + if success: + self.total_generated_samples += 1 + if self.total_generated_samples % 10 == 0: + logger.info( + f"Generated {self.total_generated_samples} batches, " + f"param_version={self.current_param_version}, " + f"errors={self.generation_errors}" + ) + else: + self.dropped_stale_samples += 1 + if self.dropped_stale_samples % 5 == 0: + logger.warning(f"Dropped stale samples: {self.dropped_stale_samples}") + def _report_loop(self): try: # 主线程保持运行,处理控制信号和状态监控 last_stats_time = time.time() diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 16354313e4e..97567527b97 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -16,38 +16,25 @@ import time import warnings from pprint import pprint -from typing import Optional import numpy as np import ray -import torch from omegaconf import OmegaConf from torch.utils.data import Dataset, Sampler from tqdm import tqdm -from recipe.fully_async_policy.message_queue import QueueSample, MessageQueueClient -from verl import DataProto +from recipe.fully_async_policy.message_queue import MessageQueueClient, QueueSample from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup from verl.single_controller.ray.base import create_colocated_worker_cls from verl.trainer.ppo import core_algos -from verl.trainer.ppo.core_algos import AdvantageEstimator, agg_loss -from 
verl.trainer.ppo.metric_utils import ( - compute_data_metrics, - compute_throughout_metrics, - compute_timing_metrics, -) +from verl.trainer.ppo.core_algos import AdvantageEstimator from verl.trainer.ppo.ray_trainer import ( RayPPOTrainer, ResourcePoolManager, Role, WorkerType, - apply_kl_penalty, - compute_advantage, - compute_response_mask, ) -from verl.trainer.ppo.reward import compute_reward, compute_reward_async from verl.utils.debug import marked_timer -from verl.utils.metric import reduce_metrics from verl.utils.tracking import ValidationGenerationsLogger logger = logging.getLogger(__name__) @@ -70,10 +57,10 @@ def __init__( processor=None, reward_fn=None, val_reward_fn=None, - train_dataset: Optional[Dataset] = None, - val_dataset: Optional[Dataset] = None, + train_dataset: Dataset | None = None, + val_dataset: Dataset | None = None, collate_fn=None, - train_sampler: Optional[Sampler] = None, + train_sampler: Sampler | None = None, device_name=None, ): """ diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py index 8e686e9a471..06f0d2cbbe9 100644 --- a/recipe/fully_async_policy/message_queue.py +++ b/recipe/fully_async_policy/message_queue.py @@ -18,7 +18,7 @@ import uuid from collections import deque from dataclasses import dataclass -from typing import Any, Optional, List +from typing import Any import ray from omegaconf import DictConfig @@ -77,7 +77,7 @@ def __init__(self, config: DictConfig, max_queue_size: int = 1000): ) def put_samples( - self, epoch: int, samples: List[Any], param_version: int, rollout_metadata_list: List[dict[str, Any]] = None + self, epoch: int, samples: list[Any], param_version: int, rollout_metadata_list: list[dict[str, Any]] = None ) -> bool: """ 放入一个batch样本到队列 @@ -107,7 +107,7 @@ def put_samples( logger.warning(f"len(rollout_metadata_list):{len(rollout_metadata_list)} != len(samples:{len(samples)}") return False - for sample, meta in zip(samples, rollout_metadata_list): + for 
sample, meta in zip(samples, rollout_metadata_list, strict=False): queue_sample = QueueSample( id=str(uuid.uuid4()), epoch=epoch, @@ -238,7 +238,7 @@ def __init__(self, queue_actor: Any): self.queue_actor = queue_actor def put_batch( - self, epoch: int, batch: List[Any], param_version: int, rollout_metadata_list: List[dict[str, Any]] = None + self, epoch: int, batch: list[Any], param_version: int, rollout_metadata_list: list[dict[str, Any]] = None ) -> bool: """放入batch到队列""" return ray.get(self.queue_actor.put_samples.remote(epoch, batch, param_version, rollout_metadata_list)) diff --git a/recipe/fully_async_policy/unittest/test_mq.py b/recipe/fully_async_policy/unittest/test_mq.py index 36172d02640..52a9f17d8ae 100644 --- a/recipe/fully_async_policy/unittest/test_mq.py +++ b/recipe/fully_async_policy/unittest/test_mq.py @@ -11,16 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import os import threading import time from unittest.mock import Mock import pytest import ray -from recipe.fully_async_policy.message_queue import QueueSample, MessageQueue, MessageQueueClient from omegaconf import DictConfig +from recipe.fully_async_policy.message_queue import MessageQueue, MessageQueueClient, QueueSample + @pytest.fixture def mock_data_proto(): diff --git a/recipe/one_step_off_policy/ray_trainer.py b/recipe/one_step_off_policy/ray_trainer.py index c5b7a71225e..893760965d0 100644 --- a/recipe/one_step_off_policy/ray_trainer.py +++ b/recipe/one_step_off_policy/ray_trainer.py @@ -76,20 +76,20 @@ class OneStepOffRayTrainer(RayPPOTrainer): # TODO: support each role have individual ray_worker_group_cls, # i.e., support different backend of different role def __init__( - self, - config, - tokenizer, - role_worker_mapping: dict[Role, WorkerType], - resource_pool_manager: ResourcePoolManager, - ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, - processor=None, - reward_fn=None, - val_reward_fn=None, - train_dataset: Dataset | None = None, - val_dataset: Dataset | None = None, - collate_fn=None, - train_sampler: Sampler | None = None, - device_name=None, + self, + config, + tokenizer, + role_worker_mapping: dict[Role, WorkerType], + resource_pool_manager: ResourcePoolManager, + ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, + processor=None, + reward_fn=None, + val_reward_fn=None, + train_dataset: Dataset | None = None, + val_dataset: Dataset | None = None, + collate_fn=None, + train_sampler: Sampler | None = None, + device_name=None, ): """ Initialize distributed PPO trainer with Ray backend. 
@@ -164,17 +164,14 @@ def _validate(self): def _create_actor_rollout_classes(self): # create actor and rollout - if not self.hybrid_engine: - for role in [Role.Actor, Role.Rollout]: - resource_pool = self.resource_pool_manager.get_resource_pool(role) - role_cls = RayClassWithInitArgs( - cls=self.role_worker_mapping[role], - config=self.config.actor_rollout_ref, - role=str(role), - ) - self.resource_pool_to_cls[resource_pool][str(role)] = role_cls - else: - raise NotImplementedError + for role in [Role.Actor, Role.Rollout]: + resource_pool = self.resource_pool_manager.get_resource_pool(role) + role_cls = RayClassWithInitArgs( + cls=self.role_worker_mapping[role], + config=self.config.actor_rollout_ref, + role=str(role), + ) + self.resource_pool_to_cls[resource_pool][str(role)] = role_cls def _init_models(self): if self.use_critic: From 8d8b99d42ce393f39ac5fbacd06b67befc2724f9 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Mon, 4 Aug 2025 17:32:48 +0800 Subject: [PATCH 017/182] add rollouter thread --- .../fully_async_rollouter.py | 461 ++++++++++-------- .../fully_async_policy/fully_async_trainer.py | 94 +--- 2 files changed, 284 insertions(+), 271 deletions(-) diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index f1248441594..0f4f624007e 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -15,6 +15,7 @@ import threading import time from concurrent.futures import ThreadPoolExecutor +from typing import Optional import ray from omegaconf import OmegaConf @@ -29,88 +30,6 @@ logger = logging.getLogger(__name__) - -class RolloutController: - """控制rollout的暂停和恢复 - 改进的控制机制""" - - def __init__(self): - self.is_paused = False - self.pause_event = threading.Event() - self.resume_event = threading.Event() - self.resume_event.set() # 初始状态为可运行 - self.pending_requests = [] - self.lock = threading.RLock() - self.pause_count = 0 - - def 
pause(self, timeout: float | None = None) -> bool: - """ - 暂停rollout - - Args: - timeout: 暂停超时时间,如果为None则无限等待 - - Returns: - bool: 是否成功暂停 - """ - with self.lock: - if not self.is_paused: - self.is_paused = True - self.resume_event.clear() - self.pause_event.set() - self.pause_count += 1 - logger.info(f"Rollout paused (count: {self.pause_count})") - return True - else: - logger.debug("Rollout already paused") - return True - - def resume(self) -> bool: - """ - 恢复rollout - - Returns: - bool: 是否成功恢复 - """ - with self.lock: - if self.is_paused: - self.is_paused = False - self.pause_event.clear() - self.resume_event.set() - logger.info("Rollout resumed") - return True - else: - logger.debug("Rollout already running") - return True - - def wait_if_paused(self, timeout: float = None) -> bool: - """ - 如果被暂停则等待恢复 - - Args: - timeout: 等待超时时间 - - Returns: - bool: 是否成功等待(未超时) - """ - if self.is_paused: - logger.debug(f"Waiting for resume (timeout: {timeout})") - return self.resume_event.wait(timeout) - return True - - def is_pause_requested(self) -> bool: - """检查是否有暂停请求""" - return self.pause_event.is_set() - - def get_status(self) -> dict: - """获取控制器状态""" - with self.lock: - return { - "is_paused": self.is_paused, - "pause_count": self.pause_count, - "has_pending_requests": len(self.pending_requests) > 0, - } - - @ray.remote(num_cpus=10, max_concurrency=10) class FullyAsyncRollouter(RayPPOTrainer): """ @@ -181,10 +100,10 @@ def __init__( self._validate_config() self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler) + + # rollouter 参数配置 self.message_queue_client = None - # Rollout控制 - self.rollout_controller = RolloutController() self.current_param_version = 0 # 新鲜度控制 - 改进的配置管理 @@ -203,10 +122,19 @@ def __init__( self.rollout_wg = None self.message_queue_client = None - # 运行状态 + # 并发控制 self.running = False + self.paused = False self.generation_thread = None self.thread_executor = ThreadPoolExecutor(max_workers=2) + self.lock = threading.RLock() + 
self.condition = threading.Condition(self.lock) + + # 暂停/恢复统计信息 + self.pause_count = 0 + self.resume_count = 0 + self.total_pause_time = 0.0 + self.last_pause_time = None # 参数同步相关 self.param_synchronizer = None @@ -216,11 +144,13 @@ def __init__( def set_message_queue_client(self, message_queue_client: MessageQueueClient): """设置消息队列客户端""" - self.message_queue_client = message_queue_client + with self.lock: + self.message_queue_client = message_queue_client def set_parameter_synchronizer(self, param_synchronizer): """设置参数同步器""" - self.param_synchronizer = param_synchronizer + with self.lock: + self.param_synchronizer = param_synchronizer def _validate_config(self): # 验证异步训练配置 @@ -229,11 +159,11 @@ def _validate_config(self): def init_workers(self): """初始化rollout workers""" - logger.info("Initializing Rollouter workers...") - self._init_resource_pools() - - self.rollout_wg = all_wg["rollout"] - self.rollout_wg.init_model() + with self.lock: + logger.info("Initializing Rollouter workers...") + self._init_resource_pools() + self.rollout_wg = self.all_wg["rollout"] + self.rollout_wg.init_model() def _create_actor_rollout_classes(self): # only create rollout @@ -371,43 +301,43 @@ def _create_continuous_iterator(self): for batch_dict in iterator: yield epoch, batch_dict - def _should_pause_generation(self) -> bool: - """ - 判断是否应该暂停生成,基于新鲜度控制 - 改进的判断逻辑 - """ - if self.message_queue_client is None: - return False + def fit(self): + """开始异步生成样本 - 改进的主运行逻辑""" + from verl.utils.tracking import Tracking - try: - queue_stats = self.message_queue_client.get_statistics() - queue_size = queue_stats["queue_size"] - current_trainer_version = queue_stats["current_param_version"] + logger = Tracking( + project_name=self.config.trainer.project_name, + experiment_name=self.config.trainer.experiment_name, + default_backend=self.config.trainer.logger, + config=OmegaConf.to_container(self.config, resolve=True), + ) + logger.info("Starting Rollouter...") + if self.message_queue_client is 
None: + raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.") + if self.param_synchronizer is None: + raise ValueError("param_synchronizer client not set. Call set_parameter_synchronizer() first.") - # 计算参数版本差异 - version_diff = self.current_param_version - current_trainer_version + # 设置运行状态 + with self.lock: + self.running = True + self.paused = False - # 如果版本差异过大,暂停生成 - if version_diff >= self.max_staleness_allowed: - logger.debug( - f"Should pause due to staleness: rollout_version={self.current_param_version}, " - f"trainer_version={current_trainer_version}, diff={version_diff}" - ) - return True + # 创建并启动生成线程 + self.generation_thread = threading.Thread(target=self._generation_loop, daemon=True) + self.generation_thread.start() - # 如果队列太满,也暂停生成 - max_queue_size = self.staleness_threshold * self.config.data.train_batch_size - if queue_size >= max_queue_size: - logger.debug(f"Should pause due to full queue: size={queue_size}, max={max_queue_size}") - return True + # 创建并启动监控线程 + self.monitor_thread = threading.Thread(target=self._monitor_loop, daemon=True) + self.monitor_thread.start() - return False + # 等待线程完成 + self.generation_thread.join() + self.monitor_thread.join() - except Exception as e: - logger.error(f"Error checking pause conditions: {e}") - return True # 出错时暂停生成 + logger.info("Rollouter fit completed") - def fit(self): - """开始异步生成样本 - 改进的主运行逻辑 + def _generation_loop(self): + """ 主要的生成循环 循环入口,需要 @@ -428,16 +358,6 @@ def fit(self): default_backend=self.config.trainer.logger, config=OmegaConf.to_container(self.config, resolve=True), ) - logger.info("Starting Rollouter...") - if self.message_queue_client is None: - raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.") - if self.param_synchronizer is None: - raise ValueError("param_synchronizer client not set. 
Call set_parameter_synchronizer() first.") - self.running = True - - # 在单独的线程中运行生成循环 - self.report_thread = threading.Thread(target=self._report_loop, daemon=True) - self.report_thread.start() self.global_steps = 0 @@ -462,17 +382,40 @@ def fit(self): last_val_metrics = None self.max_steps_duration = 0 + """ + 主要的生成循环 + + 循环入口,需要 + 1. running 判断 + 4. 中断判断 + 3. 新鲜度判断 + + 生成样本过程中,需要 + 1. running 判断 + 2. 中断判断 + """ + continuous_iterator = self._create_continuous_iterator() for epoch, batch_dict in continuous_iterator: - if not self.running: - break - # 等待如果被暂停 - if not self.rollout_controller.wait_if_paused(timeout=1.0): + with self.lock: + if not self.running: + break + + # 如果被暂停,等待恢复 + while self.paused and self.running: + logger.debug("Generation thread paused, waiting...") + self.condition.wait() + + # 再次检查运行状态 if not self.running: break # 检查是否应该暂停生成 - self._should_pause_generation() + while True: + if self._should_pause_generation(): + with self.lock: + self.paused = True + logger.info("Generation paused due to staleness or queue size") metrics = {} timing_raw = {} @@ -503,18 +446,54 @@ def fit(self): param_version=self.current_param_version, rollout_metadata=rollout_metadata, ) - if success: - self.total_generated_samples += 1 - if self.total_generated_samples % 10 == 0: - logger.info( - f"Generated {self.total_generated_samples} batches, " - f"param_version={self.current_param_version}, " - f"errors={self.generation_errors}" - ) - else: - self.dropped_stale_samples += 1 - if self.dropped_stale_samples % 5 == 0: - logger.warning(f"Dropped stale samples: {self.dropped_stale_samples}") + + with self.lock: + if success: + self.total_generated_samples += 1 + if self.total_generated_samples % 10 == 0: + logger.info( + f"Generated {self.total_generated_samples} batches, " + f"param_version={self.current_param_version}, " + f"errors={self.generation_errors}" + ) + else: + self.dropped_stale_samples += 1 + if self.dropped_stale_samples % 5 == 0: + 
logger.warning(f"Dropped stale samples: {self.dropped_stale_samples}") + + def _monitor_loop(self): + """监控线程 - 监控状态并处理控制信号""" + try: + # 主线程保持运行,处理控制信号和状态监控 + last_stats_time = time.time() + stats_interval = 30.0 # 30秒报告一次统计 + check_interval = 5.0 # 5秒检查一次状态 + + while True: + with self.lock: + if not self.running: + break + + time.sleep(check_interval) + + # 定期打印统计信息 + current_time = time.time() + if current_time - last_stats_time >= stats_interval: + self._log_statistics() + last_stats_time = current_time + + # 检查是否应该恢复生成 + if self._should_resume_generation(): + with self.lock: + if self.paused: + self.paused = False + self.condition.notify_all() + logger.info("Generation resumed") + + except Exception as e: + logger.error(f"Error in monitor loop: {e}") + finally: + logger.info("Monitor thread exiting") def _report_loop(self): try: @@ -544,32 +523,116 @@ def _report_loop(self): finally: self.shutdown() - def _log_statistics(self): - """记录统计信息""" + + def _should_pause_generation(self) -> bool: + """ + 判断是否应该暂停生成,基于新鲜度控制 - 改进的判断逻辑 + """ try: - controller_status = self.rollout_controller.get_status() queue_stats = self.message_queue_client.get_statistics() + queue_size = queue_stats["queue_size"] + current_trainer_version = queue_stats["current_param_version"] + + # 计算参数版本差异 + version_diff = self.current_param_version - current_trainer_version + + # 如果版本差异过大,暂停生成 + if version_diff >= self.max_staleness_allowed: + logger.debug( + f"Should pause due to staleness: rollout_version={self.current_param_version}, " + f"trainer_version={current_trainer_version}, diff={version_diff}" + ) + return True + + # 如果队列太满,也暂停生成 + max_queue_size = self.staleness_threshold * self.config.data.train_batch_size + if queue_size >= max_queue_size: + logger.debug(f"Should pause due to full queue: size={queue_size}, max={max_queue_size}") + return True + + return False - logger.info( - f"Rollouter stats - Generated: {self.total_generated_samples}, " - f"Dropped: {self.dropped_stale_samples}, 
" - f"Errors: {self.generation_errors}, " - f"Queue size: {queue_stats['queue_size']}, " - f"Param version: {self.current_param_version}, " - f"Paused: {controller_status['is_paused']}, " - f"Sync requests: {self.param_sync_requests}" - ) except Exception as e: - logger.error(f"Error logging statistics: {e}") + logger.error(f"Error checking pause conditions: {e}") + return True # 出错时暂停生成 + + def _should_resume_generation(self) -> bool: + """判断是否应该恢复生成""" + if self.message_queue_client is None: + return False + + try: + with self.lock: + if not self.paused: + return False + + queue_stats = self.message_queue_client.get_statistics() + queue_size = queue_stats["queue_size"] + current_trainer_version = queue_stats["current_param_version"] + + # 计算参数版本差异 + version_diff = self.current_param_version - current_trainer_version + + # 如果版本差异减小,可以恢复生成 + if version_diff < self.max_staleness_allowed - 1: + logger.debug( + f"Can resume due to reduced staleness: rollout_version={self.current_param_version}, " + f"trainer_version={current_trainer_version}, diff={version_diff}" + ) + return True + + # 如果队列不太满,也可以恢复生成 + resume_queue_size = (self.staleness_threshold * self.config.data.train_batch_size) // 2 + if queue_size <= resume_queue_size: + logger.debug( + f"Can resume due to reduced queue: size={queue_size}, resume_threshold={resume_queue_size}") + return True + + return False + + except Exception as e: + logger.error(f"Error checking resume conditions: {e}") + return False + + def pause(self) -> bool: + """暂停生成 - 供外部调用""" + with self.lock: + if not self.running: + logger.warning("Cannot pause: not running") + return False + + if self.paused: + logger.debug("Already paused") + return True + + self.paused = True + logger.info("Generation paused") + return True + + def resume(self) -> bool: + """恢复生成 - 供外部调用""" + with self.lock: + if not self.running: + logger.warning("Cannot resume: not running") + return False + + if not self.paused: + logger.debug("Not paused") + return True + 
+ self.paused = False + self.condition.notify_all() + logger.info("Generation resumed") + return True def shutdown(self): """关闭Rollouter - 改进的关闭逻辑""" logger.info("Shutting down Rollouter...") - self.running = False - - # 恢复可能被暂停的生成线程 - self.rollout_controller.resume() + with self.lock: + self.running = False + self.paused = False + self.condition.notify_all() # 等待生成线程结束 if self.generation_thread and self.generation_thread.is_alive(): @@ -579,6 +642,14 @@ def shutdown(self): if self.generation_thread.is_alive(): logger.warning("Generation thread did not finish within timeout") + # 等待监控线程结束 + if self.monitor_thread and self.monitor_thread.is_alive(): + logger.info("Waiting for monitor thread to finish...") + self.monitor_thread.join(timeout=5.0) + + if self.monitor_thread.is_alive(): + logger.warning("Monitor thread did not finish within timeout") + # 关闭线程池 if self.thread_executor: self.thread_executor.shutdown(wait=True) @@ -593,31 +664,35 @@ def shutdown(self): logger.info("Rollouter shutdown complete") - def get_statistics(self) -> dict: - """获取统计信息 - 改进的统计信息""" - controller_status = self.rollout_controller.get_status() - - stats = { - "total_generated_samples": self.total_generated_samples, - "dropped_stale_samples": self.dropped_stale_samples, - "generation_errors": self.generation_errors, - "current_param_version": self.current_param_version, - "param_sync_requests": self.param_sync_requests, - "last_sync_time": self.last_sync_time, - "is_running": self.running, - "sync_in_progress": self.sync_in_progress, - } - - stats.update(controller_status) - - # 添加队列统计(如果可用) - if self.message_queue_client: - try: - queue_stats = self.message_queue_client.get_statistics() - stats["queue_size"] = queue_stats.get("queue_size", 0) - stats["queue_total_produced"] = queue_stats.get("total_produced", 0) - stats["queue_dropped_samples"] = queue_stats.get("dropped_samples", 0) - except Exception as e: - logger.debug(f"Error getting queue statistics: {e}") - return stats + def 
_log_statistics(self): + """记录统计信息""" + try: + controller_status = self.rollout_controller.get_status() + queue_stats = self.message_queue_client.get_statistics() + + logger.info( + f"Rollouter stats - Generated: {self.total_generated_samples}, " + f"Dropped: {self.dropped_stale_samples}, " + f"Errors: {self.generation_errors}, " + f"Queue size: {queue_stats['queue_size']}, " + f"Param version: {self.current_param_version}, " + f"Paused: {controller_status['is_paused']}, " + f"Sync requests: {self.param_sync_requests}" + ) + except Exception as e: + logger.error(f"Error logging statistics: {e}") + + def get_statistics(self) -> dict: + with self.lock: + stats = { + "total_generated_samples": self.total_generated_samples, + "dropped_stale_samples": self.dropped_stale_samples, + "generation_errors": self.generation_errors, + "current_param_version": self.current_param_version, + "param_sync_requests": self.param_sync_requests, + "last_sync_time": self.last_sync_time, + "is_running": self.running, + "sync_in_progress": self.sync_in_progress, + } + return stats \ No newline at end of file diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 97567527b97..0dd90127d7d 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -132,96 +132,34 @@ def set_message_queue_client(self, message_queue_client: MessageQueueClient): """设置消息队列客户端""" self.message_queue_client = message_queue_client - def _validate(self): - """执行验证 - 参考OneStepOffRayTrainer的验证逻辑""" - return None - - def init_workers(self): - """Initialize distributed training workers using Ray backend. - - Creates: - 1. Ray resource pools from configuration - 2. Worker groups for each role (actor, critic, etc.) 
- """ - self.resource_pool_manager.create_resource_pool() - - self.resource_pool_to_cls = {pool: {} for pool in self.resource_pool_manager.resource_pool_dict.values()} - - # 创建actor worker - resource_pool = self.resource_pool_manager.get_resource_pool(Role.Actor) - actor_cls = RayClassWithInitArgs( - cls=self.role_worker_mapping[Role.Actor], - config=self.config.actor_rollout_ref, - role="actor", - ) - self.resource_pool_to_cls[resource_pool]["actor"] = actor_cls - - # 创建critic worker - if self.use_critic: - resource_pool = self.resource_pool_manager.get_resource_pool(Role.Critic) - critic_cls = RayClassWithInitArgs(cls=self.role_worker_mapping[Role.Critic], config=self.config.critic) - self.resource_pool_to_cls[resource_pool]["critic"] = critic_cls - - # 创建reference policy worker - if self.use_reference_policy: - resource_pool = self.resource_pool_manager.get_resource_pool(Role.RefPolicy) - ref_policy_cls = RayClassWithInitArgs( - cls=self.role_worker_mapping[Role.RefPolicy], + def _create_actor_rollout_classes(self): + # create actor + for role in [Role.Actor]: + resource_pool = self.resource_pool_manager.get_resource_pool(role) + role_cls = RayClassWithInitArgs( + cls=self.role_worker_mapping[role], config=self.config.actor_rollout_ref, - role="ref", + role=str(role), ) - self.resource_pool_to_cls[resource_pool]["ref"] = ref_policy_cls - - # 创建reward model worker - if self.use_rm: - resource_pool = self.resource_pool_manager.get_resource_pool(Role.RewardModel) - rm_cls = RayClassWithInitArgs( - cls=self.role_worker_mapping[Role.RewardModel], config=self.config.reward_model - ) - self.resource_pool_to_cls[resource_pool]["rm"] = rm_cls - - # 初始化WorkerGroup - 参考OneStepOffRayTrainer的实现 - all_wg = {} - wg_kwargs = {} - if OmegaConf.select(self.config.trainer, "ray_wait_register_center_timeout") is not None: - wg_kwargs["ray_wait_register_center_timeout"] = self.config.trainer.ray_wait_register_center_timeout - if OmegaConf.select(self.config.trainer, "profile_steps") 
is not None: - wg_kwargs["profile_steps"] = OmegaConf.select(self.config.trainer, "profile_steps") - assert OmegaConf.select(self.config.trainer, "worker_nsight_options") is not None, ( - "worker_nsight_options must be set when profile_steps is set" - ) - wg_kwargs["worker_nsight_options"] = OmegaConf.to_container( - OmegaConf.select(self.config.trainer, "worker_nsight_options") - ) - - for resource_pool, class_dict in self.resource_pool_to_cls.items(): - worker_dict_cls = create_colocated_worker_cls(class_dict=class_dict) - wg_dict = self.ray_worker_group_cls( - resource_pool=resource_pool, - ray_cls_with_init=worker_dict_cls, - device_name=self.device_name, - **wg_kwargs, - ) - spawn_wg = wg_dict.spawn(prefix_set=class_dict.keys()) - all_wg.update(spawn_wg) - - # 分配worker groups - self.actor_wg = all_wg["actor"] - self.actor_wg.init_model() + self.resource_pool_to_cls[resource_pool][str(role)] = role_cls + def _init_models(self): if self.use_critic: - self.critic_wg = all_wg["critic"] + self.critic_wg = self.all_wg[str(Role.Critic)] self.critic_wg.init_model() if self.use_reference_policy and not self.ref_in_actor: - self.ref_policy_wg = all_wg["ref"] + self.ref_policy_wg = self.all_wg[str(Role.RefPolicy)] self.ref_policy_wg.init_model() if self.use_rm: - self.rm_wg = all_wg["rm"] + self.rm_wg = self.all_wg[str(Role.RewardModel)] self.rm_wg.init_model() - logger.info("FullyAsyncTrainer workers initialized successfully") + self.actor_wg = self.all_wg[str(Role.Actor)] + self.actor_wg.init_model() + self.actor_rollout_wg = self.actor_wg # to be compatible with the functions that not be modified + def fit(self): """ From ba8f1ce51ff7b8dee6eb911bbf68d6cbffb371b8 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Mon, 4 Aug 2025 18:56:59 +0800 Subject: [PATCH 018/182] lock --- .../fully_async_rollouter.py | 98 +++---------------- 1 file changed, 14 insertions(+), 84 deletions(-) diff --git a/recipe/fully_async_policy/fully_async_rollouter.py 
b/recipe/fully_async_policy/fully_async_rollouter.py index 0f4f624007e..6274237c6a8 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -30,6 +30,7 @@ logger = logging.getLogger(__name__) + @ray.remote(num_cpus=10, max_concurrency=10) class FullyAsyncRollouter(RayPPOTrainer): """ @@ -126,6 +127,7 @@ def __init__( self.running = False self.paused = False self.generation_thread = None + self.monitor_thread = None self.thread_executor = ThreadPoolExecutor(max_workers=2) self.lock = threading.RLock() self.condition = threading.Condition(self.lock) @@ -303,14 +305,6 @@ def _create_continuous_iterator(self): def fit(self): """开始异步生成样本 - 改进的主运行逻辑""" - from verl.utils.tracking import Tracking - - logger = Tracking( - project_name=self.config.trainer.project_name, - experiment_name=self.config.trainer.experiment_name, - default_backend=self.config.trainer.logger, - config=OmegaConf.to_container(self.config, resolve=True), - ) logger.info("Starting Rollouter...") if self.message_queue_client is None: raise ValueError("MessageQueue client not set. 
Call set_message_queue_client() first.") @@ -401,6 +395,9 @@ def _generation_loop(self): if not self.running: break + if self._should_pause_generation(): + self.pause() + # 如果被暂停,等待恢复 while self.paused and self.running: logger.debug("Generation thread paused, waiting...") @@ -410,13 +407,6 @@ def _generation_loop(self): if not self.running: break - # 检查是否应该暂停生成 - while True: - if self._should_pause_generation(): - with self.lock: - self.paused = True - logger.info("Generation paused due to staleness or queue size") - metrics = {} timing_raw = {} batch, gen_batch = self._prepare_generate_batch(batch_dict) @@ -499,7 +489,7 @@ def _report_loop(self): try: # 主线程保持运行,处理控制信号和状态监控 last_stats_time = time.time() - stats_interval = 30.0 # 30秒报告一次统计 + stats_interval = 10.0 while self.running: time.sleep(1.0) @@ -507,14 +497,16 @@ def _report_loop(self): # 定期打印统计信息 current_time = time.time() if current_time - last_stats_time >= stats_interval: - self._log_statistics() + self.get_statistics() last_stats_time = current_time + if not self._should_pause_generation(): + self.resume() # 检查生成线程状态 if not self.generation_thread.is_alive(): logger.error("Generation thread died, restarting...") - self.generation_thread = threading.Thread(target=self._generation_loop, daemon=True) - self.generation_thread.start() + raise RuntimeError("generation_thread not alive") + except KeyboardInterrupt: logger.info("Received interrupt signal, shutting down...") @@ -523,7 +515,6 @@ def _report_loop(self): finally: self.shutdown() - def _should_pause_generation(self) -> bool: """ 判断是否应该暂停生成,基于新鲜度控制 - 改进的判断逻辑 @@ -556,68 +547,25 @@ def _should_pause_generation(self) -> bool: logger.error(f"Error checking pause conditions: {e}") return True # 出错时暂停生成 - def _should_resume_generation(self) -> bool: - """判断是否应该恢复生成""" - if self.message_queue_client is None: - return False - - try: - with self.lock: - if not self.paused: - return False - - queue_stats = self.message_queue_client.get_statistics() - queue_size 
= queue_stats["queue_size"] - current_trainer_version = queue_stats["current_param_version"] - - # 计算参数版本差异 - version_diff = self.current_param_version - current_trainer_version - - # 如果版本差异减小,可以恢复生成 - if version_diff < self.max_staleness_allowed - 1: - logger.debug( - f"Can resume due to reduced staleness: rollout_version={self.current_param_version}, " - f"trainer_version={current_trainer_version}, diff={version_diff}" - ) - return True - - # 如果队列不太满,也可以恢复生成 - resume_queue_size = (self.staleness_threshold * self.config.data.train_batch_size) // 2 - if queue_size <= resume_queue_size: - logger.debug( - f"Can resume due to reduced queue: size={queue_size}, resume_threshold={resume_queue_size}") - return True - - return False - - except Exception as e: - logger.error(f"Error checking resume conditions: {e}") - return False - def pause(self) -> bool: """暂停生成 - 供外部调用""" with self.lock: if not self.running: - logger.warning("Cannot pause: not running") return False if self.paused: - logger.debug("Already paused") return True self.paused = True - logger.info("Generation paused") return True def resume(self) -> bool: """恢复生成 - 供外部调用""" with self.lock: if not self.running: - logger.warning("Cannot resume: not running") return False if not self.paused: - logger.debug("Not paused") return True self.paused = False @@ -664,35 +612,17 @@ def shutdown(self): logger.info("Rollouter shutdown complete") - - def _log_statistics(self): - """记录统计信息""" - try: - controller_status = self.rollout_controller.get_status() - queue_stats = self.message_queue_client.get_statistics() - - logger.info( - f"Rollouter stats - Generated: {self.total_generated_samples}, " - f"Dropped: {self.dropped_stale_samples}, " - f"Errors: {self.generation_errors}, " - f"Queue size: {queue_stats['queue_size']}, " - f"Param version: {self.current_param_version}, " - f"Paused: {controller_status['is_paused']}, " - f"Sync requests: {self.param_sync_requests}" - ) - except Exception as e: - logger.error(f"Error 
logging statistics: {e}") - def get_statistics(self) -> dict: with self.lock: + queue_stats = self.message_queue_client.get_statistics() stats = { "total_generated_samples": self.total_generated_samples, "dropped_stale_samples": self.dropped_stale_samples, - "generation_errors": self.generation_errors, "current_param_version": self.current_param_version, "param_sync_requests": self.param_sync_requests, "last_sync_time": self.last_sync_time, "is_running": self.running, "sync_in_progress": self.sync_in_progress, + "queue_size": f"{queue_stats['queue_size']}", } - return stats \ No newline at end of file + return stats From 8e5edeb8884b76134e574444657065cba147ee16 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Mon, 4 Aug 2025 19:22:45 +0800 Subject: [PATCH 019/182] test --- recipe/fully_async_policy/run_benchmark.sh | 307 ++++++++++++ .../unittest/test_fully_async_components.py | 444 ++++++++++++++++++ tests/special_e2e/run_fully_async_policy.sh | 196 ++++++++ 3 files changed, 947 insertions(+) create mode 100644 recipe/fully_async_policy/run_benchmark.sh create mode 100644 recipe/fully_async_policy/unittest/test_fully_async_components.py create mode 100644 tests/special_e2e/run_fully_async_policy.sh diff --git a/recipe/fully_async_policy/run_benchmark.sh b/recipe/fully_async_policy/run_benchmark.sh new file mode 100644 index 00000000000..f9bfaceaa32 --- /dev/null +++ b/recipe/fully_async_policy/run_benchmark.sh @@ -0,0 +1,307 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +# Benchmark script for fully_async_policy performance testing +# This script runs various performance tests to evaluate the async training system + +NUM_GPUS=${NUM_GPUS:-8} +ACTOR_STRATEGY=${ACTOR_STRATEGY:-"fsdp2"} + +# Download model if not exists +MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B-Instruct} +MODEL_PATH=${MODEL_PATH:-${HOME}/models/${MODEL_ID}} +huggingface-cli download "${MODEL_ID}" --local-dir "${MODEL_PATH}" + +# Create benchmark results directory +BENCHMARK_DIR="benchmark_results_$(date 
+%Y%m%d_%H%M%S)" +mkdir -p "${BENCHMARK_DIR}" + +echo "Starting fully_async_policy performance benchmark..." +echo "Results will be saved to: ${BENCHMARK_DIR}" + +# Benchmark parameters +n_gpus_rollout=2 +n_gpus_training=$((NUM_GPUS - n_gpus_rollout)) + +# Common parameters +train_prompt_bsz=16 +n_resp_per_prompt=4 +train_prompt_mini_bsz=4 +max_prompt_length=512 +max_response_length=1024 + +# Benchmark Test 1: Different staleness thresholds +echo "=== Benchmark Test 1: Staleness Threshold Impact ===" +staleness_values=(1 3 5 10) + +for staleness in "${staleness_values[@]}"; do + echo "Testing staleness threshold: ${staleness}" + + exp_name="benchmark-staleness-${staleness}" + log_file="${BENCHMARK_DIR}/staleness_${staleness}.log" + + timeout 300 python3 -m recipe.fully_async_policy.fully_async_main \ + data.train_files="${HOME}/data/gsm8k/train.parquet" \ + data.val_files="${HOME}/data/gsm8k/test.parquet" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + trainer.logger=['console'] \ + trainer.project_name='verl-benchmark' \ + trainer.experiment_name="${exp_name}" \ + trainer.val_before_train=False \ + trainer.test_freq=-1 \ + trainer.save_freq=-1 \ + trainer.total_epochs=1 \ + trainer.total_training_steps=10 \ + trainer.n_gpus_per_node=${n_gpus_training} \ + rollout.n_gpus_per_node=${n_gpus_rollout} \ + async_training.staleness_threshold=${staleness} \ + async_training.max_staleness_allowed=$((staleness + 2)) \ + > "${log_file}" 2>&1 || echo "Test with staleness ${staleness} timed out or failed" + + # Extract key metrics from log + if [ -f "${log_file}" ]; then + echo "=== Metrics for staleness=${staleness} ===" >> "${BENCHMARK_DIR}/summary.txt" + grep -E "(Generated.*batches|Dropped.*samples|param_version|Queue 
size)" "${log_file}" | tail -5 >> "${BENCHMARK_DIR}/summary.txt" || true + echo "" >> "${BENCHMARK_DIR}/summary.txt" + fi +done + +# Benchmark Test 2: Different queue sizes +echo "=== Benchmark Test 2: Queue Size Impact ===" +queue_sizes=(50 100 500 1000) + +for queue_size in "${queue_sizes[@]}"; do + echo "Testing queue size: ${queue_size}" + + exp_name="benchmark-queue-${queue_size}" + log_file="${BENCHMARK_DIR}/queue_${queue_size}.log" + + timeout 300 python3 -m recipe.fully_async_policy.fully_async_main \ + data.train_files="${HOME}/data/gsm8k/train.parquet" \ + data.val_files="${HOME}/data/gsm8k/test.parquet" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + trainer.logger=['console'] \ + trainer.project_name='verl-benchmark' \ + trainer.experiment_name="${exp_name}" \ + trainer.val_before_train=False \ + trainer.test_freq=-1 \ + trainer.save_freq=-1 \ + trainer.total_epochs=1 \ + trainer.total_training_steps=10 \ + trainer.n_gpus_per_node=${n_gpus_training} \ + rollout.n_gpus_per_node=${n_gpus_rollout} \ + async_training.max_queue_size=${queue_size} \ + > "${log_file}" 2>&1 || echo "Test with queue size ${queue_size} timed out or failed" + + # Extract key metrics from log + if [ -f "${log_file}" ]; then + echo "=== Metrics for queue_size=${queue_size} ===" >> "${BENCHMARK_DIR}/summary.txt" + grep -E "(Generated.*batches|Queue size|memory)" "${log_file}" | tail -5 >> "${BENCHMARK_DIR}/summary.txt" || true + echo "" >> "${BENCHMARK_DIR}/summary.txt" + fi +done + +# Benchmark Test 3: Different batch generation intervals +echo "=== Benchmark Test 3: Generation Interval Impact ===" +intervals=(0.0 0.1 0.5 1.0) + +for interval in "${intervals[@]}"; do + echo "Testing batch generation interval: ${interval}s" + + 
exp_name="benchmark-interval-${interval}" + log_file="${BENCHMARK_DIR}/interval_${interval}.log" + + timeout 300 python3 -m recipe.fully_async_policy.fully_async_main \ + data.train_files="${HOME}/data/gsm8k/train.parquet" \ + data.val_files="${HOME}/data/gsm8k/test.parquet" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + trainer.logger=['console'] \ + trainer.project_name='verl-benchmark' \ + trainer.experiment_name="${exp_name}" \ + trainer.val_before_train=False \ + trainer.test_freq=-1 \ + trainer.save_freq=-1 \ + trainer.total_epochs=1 \ + trainer.total_training_steps=10 \ + trainer.n_gpus_per_node=${n_gpus_training} \ + rollout.n_gpus_per_node=${n_gpus_rollout} \ + async_training.batch_generation_interval=${interval} \ + > "${log_file}" 2>&1 || echo "Test with interval ${interval} timed out or failed" + + # Extract key metrics from log + if [ -f "${log_file}" ]; then + echo "=== Metrics for interval=${interval}s ===" >> "${BENCHMARK_DIR}/summary.txt" + grep -E "(Generated.*batches|generation_timestamp)" "${log_file}" | tail -5 >> "${BENCHMARK_DIR}/summary.txt" || true + echo "" >> "${BENCHMARK_DIR}/summary.txt" + fi +done + +# Benchmark Test 4: Resource allocation comparison +echo "=== Benchmark Test 4: Resource Allocation Comparison ===" + +# Test different rollout/training GPU distributions +if [ "${NUM_GPUS}" -ge "6" ]; then + gpu_configs=( + "1,$((NUM_GPUS - 1))" # 1 rollout, rest training + "2,$((NUM_GPUS - 2))" # 2 rollout, rest training + "3,$((NUM_GPUS - 3))" # 3 rollout, rest training + ) + + for config in "${gpu_configs[@]}"; do + IFS=',' read -r rollout_gpus training_gpus <<< "$config" + + echo "Testing GPU allocation: ${rollout_gpus} rollout, ${training_gpus} training" + + 
exp_name="benchmark-gpu-${rollout_gpus}r-${training_gpus}t" + log_file="${BENCHMARK_DIR}/gpu_${rollout_gpus}_${training_gpus}.log" + + timeout 300 python3 -m recipe.fully_async_policy.fully_async_main \ + data.train_files="${HOME}/data/gsm8k/train.parquet" \ + data.val_files="${HOME}/data/gsm8k/test.parquet" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + trainer.logger=['console'] \ + trainer.project_name='verl-benchmark' \ + trainer.experiment_name="${exp_name}" \ + trainer.val_before_train=False \ + trainer.test_freq=-1 \ + trainer.save_freq=-1 \ + trainer.total_epochs=1 \ + trainer.total_training_steps=10 \ + trainer.n_gpus_per_node=${training_gpus} \ + rollout.n_gpus_per_node=${rollout_gpus} \ + > "${log_file}" 2>&1 || echo "Test with GPU config ${config} timed out or failed" + + # Extract key metrics from log + if [ -f "${log_file}" ]; then + echo "=== Metrics for ${rollout_gpus}r/${training_gpus}t GPUs ===" >> "${BENCHMARK_DIR}/summary.txt" + grep -E "(Generated.*batches|training.*steps|GPU)" "${log_file}" | tail -5 >> "${BENCHMARK_DIR}/summary.txt" || true + echo "" >> "${BENCHMARK_DIR}/summary.txt" + fi + done +fi + +# Benchmark Test 5: Pause/Resume Performance +echo "=== Benchmark Test 5: Pause/Resume Performance Test ===" +log_file="${BENCHMARK_DIR}/pause_resume.log" + +# Start the training in background +python3 -m recipe.fully_async_policy.fully_async_main \ + data.train_files="${HOME}/data/gsm8k/train.parquet" \ + data.val_files="${HOME}/data/gsm8k/test.parquet" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + 
actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + trainer.logger=['console'] \ + trainer.project_name='verl-benchmark-pause' \ + trainer.experiment_name='pause-resume-test' \ + trainer.val_before_train=False \ + trainer.test_freq=-1 \ + trainer.save_freq=-1 \ + trainer.total_epochs=1 \ + trainer.total_training_steps=20 \ + trainer.n_gpus_per_node=${n_gpus_training} \ + rollout.n_gpus_per_node=${n_gpus_rollout} \ + > "${log_file}" 2>&1 & + +TRAINING_PID=$! + +# Note: In actual implementation, we would need a way to remotely control pause/resume +# This is a placeholder for testing the pause/resume functionality +echo "Training started with PID: ${TRAINING_PID}" +echo "Pause/resume testing would require remote control interface" >> "${BENCHMARK_DIR}/summary.txt" + +# Wait a bit and then kill the training (simulating early termination) +sleep 60 +if kill -0 $TRAINING_PID 2>/dev/null; then + echo "Stopping training process..." + kill $TRAINING_PID +fi + +# Generate performance report +echo "=== Generating Performance Report ===" +report_file="${BENCHMARK_DIR}/performance_report.md" + +cat > "${report_file}" << EOF +# Fully Async Policy Performance Benchmark Report + +**Date:** $(date) +**Hardware:** ${NUM_GPUS} GPUs +**Strategy:** ${ACTOR_STRATEGY} +**Model:** ${MODEL_ID} + +## Test Configuration +- Training Batch Size: ${train_prompt_bsz} +- Responses per Prompt: ${n_resp_per_prompt} +- Max Prompt Length: ${max_prompt_length} +- Max Response Length: ${max_response_length} + +## Results Summary +$(cat "${BENCHMARK_DIR}/summary.txt" 2>/dev/null || echo "No summary available") + +## Log Files +EOF + +# List all log files +for log_file in "${BENCHMARK_DIR}"/*.log; do + if [ -f "$log_file" ]; then + echo "- $(basename "${log_file}")" >> "${report_file}" + fi +done + +cat >> "${report_file}" << EOF + +## Key Findings +- **Staleness Impact:** Lower staleness thresholds may increase sample dropping but improve freshness +- 
**Queue Size Impact:** Larger queues provide better buffering but use more memory +- **Generation Interval:** Shorter intervals increase throughput but may stress the system +- **GPU Allocation:** Balance between generation and training capacity is crucial +- **Pause/Resume:** System should handle interruptions gracefully + +## Recommendations +1. Start with staleness_threshold=3 for good balance +2. Use queue_size=500-1000 for most workloads +3. Set generation_interval=0.1s for good performance +4. Allocate 2-3 GPUs for rollout in typical 8-GPU setups +5. Monitor queue utilization and adjust based on workload + +EOF + +echo "Benchmark completed!" +echo "Results saved to: ${BENCHMARK_DIR}/" +echo "Performance report: ${report_file}" + +# Print summary to console +if [ -f "${BENCHMARK_DIR}/summary.txt" ]; then + echo "" + echo "=== BENCHMARK SUMMARY ===" + cat "${BENCHMARK_DIR}/summary.txt" +fi + diff --git a/recipe/fully_async_policy/unittest/test_fully_async_components.py b/recipe/fully_async_policy/unittest/test_fully_async_components.py new file mode 100644 index 00000000000..8e5279b84bb --- /dev/null +++ b/recipe/fully_async_policy/unittest/test_fully_async_components.py @@ -0,0 +1,444 @@ +# Copyright 2025 Meituan Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +单元测试文件,用于测试完全异步PPO训练系统的各个组件 +""" + +import os + +# Import components to test +import sys +import time +import unittest +from unittest.mock import Mock + +import ray +from omegaconf import OmegaConf + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from fully_async_rollouter import FullyAsyncRollouter +from fully_async_trainer import FullyAsyncTrainer +from message_queue import MessageQueueClient +from param_sync import ParameterSynchronizer + + +class TestMessageQueue(unittest.TestCase): + """测试MessageQueue的功能""" + + def setUp(self): + """设置测试环境""" + if not ray.is_initialized(): + ray.init(ignore_reinit_error=True) + + # 创建MessageQueue客户端 + self.message_queue = MessageQueueClient.remote(max_queue_size=100, max_staleness=3) + + def tearDown(self): + """清理测试环境""" + if hasattr(self, "message_queue"): + ray.kill(self.message_queue) + + def test_put_and_get_samples(self): + """测试放入和获取样本的基本功能""" + # 创建模拟样本数据 + mock_sample = Mock() + mock_sample.batch_size = 4 + + # 测试放入样本 + success = ray.get( + self.message_queue.put_samples.remote( + epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()} + ) + ) + self.assertTrue(success) + + # 测试获取样本 + result = ray.get(self.message_queue.get_samples.remote(min_batch_count=1, timeout=5.0, current_param_version=1)) + + self.assertIsNotNone(result) + samples, metadata_list = result + self.assertEqual(len(samples), 1) + self.assertEqual(len(metadata_list), 1) + + def test_staleness_control(self): + """测试新鲜度控制功能""" + mock_sample = Mock() + mock_sample.batch_size = 4 + + # 放入一个参数版本较老的样本 + success = ray.get( + self.message_queue.put_samples.remote( + epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()} + ) + ) + self.assertTrue(success) + + # 尝试用较新的参数版本获取样本(应该被拒绝) + result = ray.get( + self.message_queue.get_samples.remote( + min_batch_count=1, + timeout=5.0, + current_param_version=5, # 版本差距为4 > max_staleness(3) + ) + ) + + # 
应该返回空结果,因为样本过期 + self.assertIsNone(result) + + def test_queue_statistics(self): + """测试队列统计功能""" + # 获取初始统计 + stats = ray.get(self.message_queue.get_statistics.remote()) + initial_queue_size = stats["queue_size"] + + # 添加一些样本 + mock_sample = Mock() + mock_sample.batch_size = 4 + + for i in range(3): + ray.get( + self.message_queue.put_samples.remote( + epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()} + ) + ) + + # 检查统计是否更新 + stats = ray.get(self.message_queue.get_statistics.remote()) + self.assertEqual(stats["queue_size"], initial_queue_size + 3) + self.assertEqual(stats["total_produced"], 3) + + +class TestParameterSynchronizer(unittest.TestCase): + """测试参数同步器的功能""" + + def setUp(self): + """设置测试环境""" + if not ray.is_initialized(): + ray.init(ignore_reinit_error=True) + + self.config = OmegaConf.create( + {"async_training": {"max_sync_retries": 3, "sync_timeout": 10.0, "sync_retry_delay": 0.1}} + ) + + def test_sync_with_retry(self): + """测试带重试机制的参数同步""" + # 创建模拟的worker groups + mock_actor_wg = Mock() + mock_rollout_wg = Mock() + + # 模拟同步操作 + mock_actor_wg.get_weights.return_value = ray.put({"param1": "value1"}) + mock_rollout_wg.set_weights.return_value = [] + + synchronizer = ParameterSynchronizer.remote( + config=self.config, actor_wg=mock_actor_wg, rollout_wg=mock_rollout_wg + ) + + # 测试成功同步 + result = ray.get(synchronizer.sync_weights.remote()) + self.assertTrue(result) + + def test_sync_failure_and_retry(self): + """测试同步失败和重试机制""" + mock_actor_wg = Mock() + mock_rollout_wg = Mock() + + # 模拟同步失败 + mock_actor_wg.get_weights.side_effect = Exception("Sync failed") + + synchronizer = ParameterSynchronizer.remote( + config=self.config, actor_wg=mock_actor_wg, rollout_wg=mock_rollout_wg + ) + + # 测试失败时的重试 + result = ray.get(synchronizer.sync_weights.remote()) + self.assertFalse(result) + + +class TestFullyAsyncRollouter(unittest.TestCase): + """测试异步Rollouter的功能""" + + def setUp(self): + """设置测试环境""" + if not 
ray.is_initialized(): + ray.init(ignore_reinit_error=True) + + def test_pause_resume_functionality(self): + """测试暂停和恢复功能""" + # 创建配置 + config = OmegaConf.create( + { + "actor_rollout_ref": {"hybrid_engine": False, "model": {"lora_rank": 0}, "rollout": {"n": 2}}, + "algorithm": {"use_kl_in_reward": False}, + "critic": {"enable": False}, + "trainer": {"device": "cpu", "project_name": "test", "experiment_name": "test"}, + "async_training": { + "staleness_threshold": 3, + "max_staleness_allowed": 5, + "generation_timeout": 10.0, + "batch_generation_interval": 0.1, + }, + } + ) + + # 创建模拟的依赖 + mock_tokenizer = Mock() + mock_role_worker_mapping = Mock() + mock_resource_pool_manager = Mock() + + # 创建Rollouter实例 + rollouter = FullyAsyncRollouter.remote( + config=config, + tokenizer=mock_tokenizer, + role_worker_mapping=mock_role_worker_mapping, + resource_pool_manager=mock_resource_pool_manager, + ) + + # 测试暂停功能 + result = ray.get(rollouter.pause_rollout.remote()) + self.assertTrue(result) + + # 检查暂停状态 + is_paused = ray.get(rollouter.is_rollout_paused.remote()) + self.assertTrue(is_paused) + + # 测试恢复功能 + result = ray.get(rollouter.resume_rollout.remote()) + self.assertTrue(result) + + # 检查恢复状态 + is_paused = ray.get(rollouter.is_rollout_paused.remote()) + self.assertFalse(is_paused) + + def test_statistics_collection(self): + """测试统计信息收集功能""" + config = OmegaConf.create( + { + "actor_rollout_ref": {"hybrid_engine": False, "model": {"lora_rank": 0}, "rollout": {"n": 2}}, + "algorithm": {"use_kl_in_reward": False}, + "critic": {"enable": False}, + "trainer": {"device": "cpu", "project_name": "test", "experiment_name": "test"}, + "async_training": {"staleness_threshold": 3, "max_staleness_allowed": 5, "generation_timeout": 10.0}, + } + ) + + mock_tokenizer = Mock() + mock_role_worker_mapping = Mock() + mock_resource_pool_manager = Mock() + + rollouter = FullyAsyncRollouter.remote( + config=config, + tokenizer=mock_tokenizer, + role_worker_mapping=mock_role_worker_mapping, + 
resource_pool_manager=mock_resource_pool_manager, + ) + + # 获取统计信息 + stats = ray.get(rollouter.get_statistics.remote()) + + # 验证统计信息包含必要的字段 + expected_keys = [ + "total_generated_samples", + "dropped_stale_samples", + "generation_errors", + "current_param_version", + "is_paused", + "pause_count", + "resume_count", + ] + + for key in expected_keys: + self.assertIn(key, stats) + + +class TestFullyAsyncTrainer(unittest.TestCase): + """测试异步Trainer的功能""" + + def setUp(self): + """设置测试环境""" + if not ray.is_initialized(): + ray.init(ignore_reinit_error=True) + + def test_freshness_metrics_calculation(self): + """测试新鲜度指标计算""" + # 创建基本配置 + config = OmegaConf.create( + { + "trainer": { + "device": "cpu", + "project_name": "test", + "experiment_name": "test", + "total_epochs": 1, + "total_training_steps": 2, + }, + "async_training": {"staleness_threshold": 3, "max_staleness_allowed": 5, "batch_timeout": 10.0}, + "data": {"train_batch_size": 4}, + "actor_rollout_ref": {"hybrid_engine": False, "model": {"lora_rank": 0}}, + "algorithm": {"use_kl_in_reward": False}, + "critic": {"enable": False}, + } + ) + + # 创建模拟的依赖 + mock_tokenizer = Mock() + mock_role_worker_mapping = Mock() + mock_resource_pool_manager = Mock() + + trainer = FullyAsyncTrainer.remote( + config=config, + tokenizer=mock_tokenizer, + role_worker_mapping=mock_role_worker_mapping, + resource_pool_manager=mock_resource_pool_manager, + ) + + # 测试新鲜度指标计算 + current_time = time.time() + metadata_list = [ + {"generation_timestamp": current_time - 5, "rollout_param_version": 1}, + {"generation_timestamp": current_time - 10, "rollout_param_version": 2}, + {"generation_timestamp": current_time - 15, "rollout_param_version": 1}, + ] + + freshness_metrics = ray.get(trainer._calculate_freshness_metrics.remote(metadata_list, current_param_version=3)) + + # 验证新鲜度指标 + self.assertIn("avg_sample_age", freshness_metrics) + self.assertIn("max_sample_age", freshness_metrics) + self.assertIn("min_sample_age", freshness_metrics) + 
self.assertIn("version_diversity", freshness_metrics) + self.assertIn("staleness_ratio", freshness_metrics) + + +class TestIntegrationScenarios(unittest.TestCase): + """测试组件集成场景""" + + def setUp(self): + """设置测试环境""" + if not ray.is_initialized(): + ray.init(ignore_reinit_error=True) + + def test_message_queue_trainer_integration(self): + """测试MessageQueue与Trainer的集成""" + # 创建MessageQueue + message_queue = MessageQueueClient.remote(max_queue_size=10, max_staleness=3) + + # 放入一些测试样本 + mock_sample = Mock() + mock_sample.batch_size = 4 + + ray.get( + message_queue.put_samples.remote( + epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()} + ) + ) + + # 验证Trainer能够获取样本 + result = ray.get(message_queue.get_samples.remote(min_batch_count=1, timeout=5.0, current_param_version=1)) + + self.assertIsNotNone(result) + samples, metadata_list = result + self.assertEqual(len(samples), 1) + + def test_rollouter_message_queue_integration(self): + """测试Rollouter与MessageQueue的集成""" + # 这个测试需要更多的模拟设置,因为涉及到实际的模型生成 + # 在实际实现中,可以使用更多的Mock对象来模拟这种集成 + pass + + +class TestErrorHandling(unittest.TestCase): + """测试错误处理和边界情况""" + + def setUp(self): + """设置测试环境""" + if not ray.is_initialized(): + ray.init(ignore_reinit_error=True) + + def test_message_queue_overflow(self): + """测试消息队列溢出处理""" + # 创建小容量的队列 + message_queue = MessageQueueClient.remote(max_queue_size=2, max_staleness=3) + + mock_sample = Mock() + mock_sample.batch_size = 4 + + # 填满队列 + for i in range(2): + result = ray.get( + message_queue.put_samples.remote( + epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()} + ) + ) + self.assertTrue(result) + + # 尝试再放入一个样本(应该失败或者覆盖旧样本) + result = ray.get( + message_queue.put_samples.remote( + epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()} + ) + ) + + # 根据实现,这里可能是False(拒绝)或True(覆盖) + self.assertIsInstance(result, bool) + + def test_timeout_handling(self): + """测试超时处理""" + 
message_queue = MessageQueueClient.remote(max_queue_size=10, max_staleness=3) + + # 尝试从空队列获取样本,应该超时 + start_time = time.time() + result = ray.get( + message_queue.get_samples.remote( + min_batch_count=1, + timeout=1.0, # 1秒超时 + current_param_version=1, + ) + ) + elapsed = time.time() - start_time + + # 应该返回None并且大约在1秒后返回 + self.assertIsNone(result) + self.assertGreater(elapsed, 0.9) # 允许一些误差 + self.assertLess(elapsed, 2.0) + + +if __name__ == "__main__": + # 设置测试套件 + test_suite = unittest.TestSuite() + + # 添加测试用例 + test_classes = [ + TestMessageQueue, + TestParameterSynchronizer, + TestFullyAsyncRollouter, + TestFullyAsyncTrainer, + TestIntegrationScenarios, + TestErrorHandling, + ] + + for test_class in test_classes: + tests = unittest.TestLoader().loadTestsFromTestCase(test_class) + test_suite.addTests(tests) + + # 运行测试 + runner = unittest.TextTestRunner(verbosity=2) + result = runner.run(test_suite) + + # 清理Ray + if ray.is_initialized(): + ray.shutdown() + + # 退出 + exit(0 if result.wasSuccessful() else 1) diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh new file mode 100644 index 00000000000..9692aab0d44 --- /dev/null +++ b/tests/special_e2e/run_fully_async_policy.sh @@ -0,0 +1,196 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +# Test script for fully_async_policy E2E regression testing +# This script runs fully async PPO training with both FSDP2 and Megatron backends +# to ensure the asynchronous training mechanism works correctly + +NUM_GPUS=${NUM_GPUS:-8} +ACTOR_STRATEGY=${ACTOR_STRATEGY:-"fsdp2"} # fsdp2 or megatron + +# Download model if not exists +MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B-Instruct} +MODEL_PATH=${MODEL_PATH:-${HOME}/models/${MODEL_ID}} +huggingface-cli download "${MODEL_ID}" --local-dir "${MODEL_PATH}" + +# Algorithm parameters +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +# Response length 
parameters +max_prompt_length=1024 +max_response_length=2048 +enable_overlong_buffer=True +overlong_buffer_len=128 +overlong_penalty_factor=1.0 + +# Training parameters +loss_agg_mode="token-mean" +train_prompt_bsz=8 +n_resp_per_prompt=3 +train_prompt_mini_bsz=4 + +# Temperature parameters +temperature=1.0 +top_p=1.0 +top_k=-1 +val_top_p=0.7 + +# Fully async specific parameters +# Allocate 2 GPUs for rollout, remaining for training +n_gpus_rollout=2 +n_gpus_training=$((NUM_GPUS - n_gpus_rollout)) + +# Async training specific configurations +staleness_threshold=3 +max_staleness_allowed=5 +max_queue_size=1000 +min_batch_count=1 +batch_timeout=30.0 +generation_timeout=30.0 +batch_generation_interval=0.1 +max_sync_retries=3 +sync_timeout=30.0 +sync_retry_delay=1.0 + +exp_name="$(basename "${MODEL_ID,,}")-fully-async-policy-${ACTOR_STRATEGY}-minimal" + +echo "Running fully_async_policy with ${ACTOR_STRATEGY} strategy" +echo "Total GPUs: ${NUM_GPUS}, Rollout GPUs: ${n_gpus_rollout}, Training GPUs: ${n_gpus_training}" + +# Common parameters for both FSDP2 and Megatron +common_params=( + data.train_files="${HOME}/data/gsm8k/train.parquet" + data.val_files="${HOME}/data/gsm8k/test.parquet" + data.prompt_key=prompt + data.truncation='left' + data.max_prompt_length=${max_prompt_length} + data.max_response_length=${max_response_length} + data.train_batch_size=${train_prompt_bsz} + actor_rollout_ref.rollout.n=${n_resp_per_prompt} + algorithm.adv_estimator=${adv_estimator} + algorithm.use_kl_in_reward=${use_kl_in_reward} + algorithm.kl_ctrl.kl_coef=${kl_coef} + actor_rollout_ref.hybrid_engine=False + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} + actor_rollout_ref.actor.clip_ratio_c=10.0 + actor_rollout_ref.model.path="${MODEL_PATH}" + actor_rollout_ref.model.enable_gradient_checkpointing=True + 
actor_rollout_ref.actor.optim.lr=1e-6 + actor_rollout_ref.actor.optim.lr_warmup_steps=-1 + actor_rollout_ref.actor.optim.weight_decay=0.1 + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} + actor_rollout_ref.actor.entropy_coeff=0 + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} + actor_rollout_ref.rollout.gpu_memory_utilization=0.80 + actor_rollout_ref.rollout.temperature=${temperature} + actor_rollout_ref.rollout.top_p=${top_p} + actor_rollout_ref.rollout.top_k=${top_k} + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} + actor_rollout_ref.rollout.val_kwargs.do_sample=True + actor_rollout_ref.rollout.val_kwargs.n=1 + actor_rollout_ref.rollout.enable_chunked_prefill=True + reward_model.reward_manager=dapo + +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False + +reward_model.reward_kwargs.max_resp_len=${max_response_length} + trainer.logger=['console'] + trainer.project_name='verl-test-fully-async' + trainer.experiment_name="${exp_name}" + trainer.val_before_train=False + trainer.test_freq=-1 + trainer.save_freq=-1 + trainer.total_epochs=2 + trainer.total_training_steps=4 + trainer.resume_mode=disable + trainer.nnodes=1 + trainer.n_gpus_per_node=${n_gpus_training} + rollout.nnodes=1 + rollout.n_gpus_per_node=${n_gpus_rollout} + # Fully async specific configurations + async_training.staleness_threshold=${staleness_threshold} + async_training.max_staleness_allowed=${max_staleness_allowed} + async_training.max_queue_size=${max_queue_size} + async_training.min_batch_count=${min_batch_count} + async_training.batch_timeout=${batch_timeout} + 
async_training.generation_timeout=${generation_timeout} + async_training.batch_generation_interval=${batch_generation_interval} + async_training.max_sync_retries=${max_sync_retries} + async_training.sync_timeout=${sync_timeout} + async_training.sync_retry_delay=${sync_retry_delay} +) + +if [ "${ACTOR_STRATEGY}" == "fsdp2" ]; then + echo "Running fully async training with FSDP2 strategy..." + # FSDP2 specific parameters + gen_tp=2 + sp_size=2 + fsdp_size=2 + ref_offload=True + actor_offload=False + + python3 -m recipe.fully_async_policy.fully_async_main \ + "${common_params[@]}" \ + actor_rollout_ref.actor.strategy=fsdp2 \ + critic.strategy=fsdp2 \ + actor_rollout_ref.actor.grad_clip=1.0 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.use_dynamic_bsz=True \ + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=True \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=True \ + actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \ + actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} $@ + +elif [ "${ACTOR_STRATEGY}" == "megatron" ]; then + echo "Running fully async training with Megatron strategy..." 
+ # Megatron specific parameters + gen_tp=2 + train_tp=1 + train_pp=2 + ref_offload=True + actor_offload=False + + python3 -m recipe.fully_async_policy.fully_async_main \ + --config-path=config \ + --config-name='fully_async_ppo_megatron_trainer.yaml' \ + "${common_params[@]}" \ + actor_rollout_ref.actor.strategy=megatron \ + critic.strategy=megatron \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.actor.megatron.param_offload=${actor_offload} \ + actor_rollout_ref.actor.megatron.optimizer_offload=${actor_offload} \ + actor_rollout_ref.actor.megatron.grad_offload=${actor_offload} \ + actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \ + actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \ + actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \ + actor_rollout_ref.ref.megatron.param_offload=${ref_offload} $@ +else + echo "Error: Unknown strategy ${ACTOR_STRATEGY}. 
Please use 'fsdp2' or 'megatron'" + exit 1 +fi + +echo "Fully async policy E2E test completed successfully with ${ACTOR_STRATEGY} strategy" + From 941c3dea8ccb39981ef53f17a5a9bb117f67b702 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Tue, 5 Aug 2025 11:53:26 +0800 Subject: [PATCH 020/182] init models --- recipe/fully_async_policy/fully_async_main.py | 159 ++++++------------ .../fully_async_rollouter.py | 108 ++++++------ .../fully_async_policy/fully_async_trainer.py | 5 - 3 files changed, 100 insertions(+), 172 deletions(-) diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py index e8053e74647..54460b3611d 100644 --- a/recipe/fully_async_policy/fully_async_main.py +++ b/recipe/fully_async_policy/fully_async_main.py @@ -32,17 +32,6 @@ from verl.trainer.ppo.reward import load_reward_manager from verl.utils.fs import copy_to_local -logger = logging.getLogger(__name__) - - -def setup_logging(): - """设置日志配置""" - logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", - handlers=[logging.StreamHandler(), logging.FileHandler("fully_async_training.log")], - ) - def create_resource_pool_manager(config, roles: list) -> ResourcePoolManager: """ @@ -81,9 +70,6 @@ def create_resource_pool_manager(config, roles: list) -> ResourcePoolManager: resource_pool_spec["rollout_pool"] = rollout_pool mapping[Role.Rollout] = "rollout_pool" - logger.info(f"Resource pool specification: {resource_pool_spec}") - logger.info(f"Role mapping: {mapping}") - return ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=mapping) @@ -168,22 +154,22 @@ def __init__(self): def run(self, config): """运行完全异步的PPO训练""" - setup_logging() - logger.info("Starting fully async PPO training...") + print("Starting fully async PPO training...") # 设置信号处理 self._setup_signal_handlers() # 初始化基础组件 self._initialize_components(config) + time.sleep(60) # 启动训练流程 - self._run_training_loop() + # 
self._run_training_loop() - self._cleanup_resources() + # self._cleanup_resources() def _setup_signal_handlers(self): """设置信号处理器""" def signal_handler(signum, frame): - logger.info(f"Received signal {signum}, initiating shutdown...") + print(f"Received signal {signum}, initiating shutdown...") self.running = False self.shutdown_event.set() @@ -206,7 +192,7 @@ def _initialize_components(self, config) -> None: OmegaConf.resolve(config) # 初始化模型路径和tokenizer - logger.info("Initializing model and tokenizer...") + print("Initializing model and tokenizer...") local_path = copy_to_local( config.actor_rollout_ref.model.path, use_shm=config.actor_rollout_ref.model.get("use_shm", False) ) @@ -222,27 +208,13 @@ def _initialize_components(self, config) -> None: self.components["processor"] = processor # 创建worker映射和资源池 - logger.info("Creating worker mapping and resource pools...") + print("Creating worker mapping and resource pools...") role_worker_mapping, ray_worker_group_cls = create_role_worker_mapping(config) self.components["role_worker_mapping"] = role_worker_mapping self.components["ray_worker_group_cls"] = ray_worker_group_cls - # 创建数据集 - logger.info("Creating datasets...") - from verl.trainer.main_ppo import create_rl_dataset, create_rl_sampler - from verl.utils.dataset.rl_dataset import collate_fn - - train_dataset = create_rl_dataset(config.data.train_files, config.data, tokenizer, processor) - val_dataset = create_rl_dataset(config.data.val_files, config.data, tokenizer, processor) - train_sampler = create_rl_sampler(config.data, train_dataset) - - self.components["train_dataset"] = train_dataset - self.components["val_dataset"] = val_dataset - self.components["train_sampler"] = train_sampler - self.components["collate_fn"] = collate_fn - # 创建奖励函数 - logger.info("Loading reward functions...") + print("Loading reward functions...") reward_fn = load_reward_manager( config, tokenizer, num_examine=0, **config.reward_model.get("reward_kwargs", {}) ) @@ -253,7 +225,7 @@ def 
_initialize_components(self, config) -> None: self.components["val_reward_fn"] = val_reward_fn # 创建MessageQueue - logger.info("Creating MessageQueue...") + print("Creating MessageQueue...") max_queue_size = config.async_training.get("max_queue_size", 1000) message_queue = MessageQueue.remote(config, max_queue_size) message_queue_client = MessageQueueClient(message_queue) @@ -262,25 +234,26 @@ def _initialize_components(self, config) -> None: self.components["message_queue_client"] = message_queue_client # 创建Rollouter - logger.info("Creating Rollouter...") + print("Creating Rollouter...") self._create_rollouter(config) # 创建Trainer - logger.info("Creating FullyAsyncTrainer...") + print("Creating FullyAsyncTrainer...") self._create_trainer(config) # 设置参数同步 - logger.info("Setting up parameter synchronization...") - param_synchronizer = AsyncParameterSynchronizer( - config=config, - actor_wg=self.components["trainer"].actor_wg, - rollouter=self.components["rollouter"], - ) - self.components["param_synchronizer"] = param_synchronizer - logger.info("All components initialized successfully") + # print("Setting up parameter synchronization...") + # param_synchronizer = AsyncParameterSynchronizer( + # config=config, + # actor_wg=self.components["trainer"].actor_wg, + # rollouter=self.components["rollouter"], + # ) + # self.components["param_synchronizer"] = param_synchronizer + # print("All components initialized successfully") def _create_rollouter(self, config) -> None: """创建Rollouter""" + pprint(self.components) rollouter = FullyAsyncRollouter.remote( config=config, tokenizer=self.components["tokenizer"], @@ -288,21 +261,19 @@ def _create_rollouter(self, config) -> None: resource_pool_manager=create_resource_pool_manager(config, roles=[Role.Rollout]), ray_worker_group_cls=self.components["ray_worker_group_cls"], processor=self.components["processor"], - train_dataset=self.components["train_dataset"], - collate_fn=self.components["collate_fn"], - 
train_sampler=self.components["train_sampler"], device_name=config.trainer.device, ) + print(rollouter) + + print("========== rollouter init workers ======") # 初始化Rollouter - init_future = rollouter.init_workers.remote() - ray.get(init_future, timeout=60.0) + ray.get(rollouter.init_workers.remote()) - set_queue_future = rollouter.set_message_queue_client.remote(self.components["message_queue_client"]) - ray.get(set_queue_future, timeout=10.0) + ray.get(rollouter.set_message_queue_client.remote(self.components["message_queue_client"])) self.components["rollouter"] = rollouter - logger.info("Rollouter created and initialized successfully") + print("Rollouter created and initialized successfully") def _create_trainer(self, config) -> None: """创建Trainer""" @@ -322,39 +293,33 @@ def _create_trainer(self, config) -> None: processor=self.components["processor"], reward_fn=self.components["reward_fn"], val_reward_fn=self.components["val_reward_fn"], - train_dataset=self.components["train_dataset"], - val_dataset=self.components["val_dataset"], - collate_fn=self.components["collate_fn"], - train_sampler=self.components["train_sampler"], device_name=config.trainer.device, ) # 初始化Trainer - trainer.init_workers() - trainer.set_message_queue_client(self.components["message_queue_client"]) - trainer.set_rollouter(self.components["rollouter"]) - + ray.get(trainer.init_workers.remote()) + ray.get(trainer.set_message_queue_client.remote(self.components["message_queue_client"])) self.components["trainer"] = trainer - logger.info("FullyAsyncTrainer created and initialized successfully") + print("FullyAsyncTrainer created and initialized successfully") def _run_training_loop(self): """运行训练循环""" self.running = True - logger.info("Starting Rollouter in background...") + print("Starting Rollouter in background...") rollouter_future = self.components["rollouter"].fit.remote() trainer_future = self.components["trainer"].fit.remote() self._monitor_components() ray.get(rollouter_future) 
ray.get(trainer_future) - logger.info("Training completed or interrupted") + print("Training completed or interrupted") def _run_rollouter(self): try: ray.get(self.components["rollouter"].fit.remote()) except Exception as e: - logger.error(f"Rollouter error: {e}") + print(f"Rollouter error: {e}") self.running = False self.shutdown_event.set() @@ -363,14 +328,14 @@ def _run_trainer(self): try: self.components["trainer"].fit() except Exception as e: - logger.error(f"Trainer error: {e}") + print(f"Trainer error: {e}") finally: self.running = False self.shutdown_event.set() def _monitor_components(self): """监控组件状态""" - logger.info("Starting component monitoring...") + print("Starting component monitoring...") last_stats_time = time.time() stats_interval = 60.0 # 60秒报告一次统计 @@ -391,9 +356,9 @@ def _monitor_components(self): self._check_component_health() except Exception as e: - logger.error(f"Error in component monitoring: {e}") + print(f"Error in component monitoring: {e}") - logger.info("Component monitoring stopped") + print("Component monitoring stopped") def _log_component_statistics(self): """记录组件统计信息""" @@ -407,27 +372,27 @@ def _log_component_statistics(self): # 获取队列统计 queue_stats = self.components["message_queue_client"].get_statistics() - logger.info("=== Component Statistics ===") - logger.info( + print("=== Component Statistics ===") + print( f"Trainer - Steps: {trainer_stats['global_steps']}, " f"Samples: {trainer_stats['processed_samples']}, " f"Param version: {trainer_stats['current_param_version']}" ) - logger.info( + print( f"Rollouter - Generated: {rollouter_stats['total_generated_samples']}, " f"Dropped: {rollouter_stats['dropped_stale_samples']}, " f"Errors: {rollouter_stats['generation_errors']}" ) - logger.info( + print( f"Queue - Size: {queue_stats['queue_size']}, " f"Produced: {queue_stats['total_produced']}, " f"Consumed: {queue_stats['total_consumed']}" ) except Exception as e: - logger.error(f"Error getting component statistics: {e}") + 
print(f"Error getting component statistics: {e}") def _check_component_health(self): """检查组件健康状态""" @@ -442,43 +407,43 @@ def _check_component_health(self): rollouter_stats = ray.get(self.components["rollouter"].get_statistics.remote(), timeout=5.0) if not rollouter_stats["is_running"]: - logger.warning("Rollouter is not running!") + print("Rollouter is not running!") # 可以尝试重启或报告错误 except Exception as e: - logger.warning(f"Health check failed: {e}") + print(f"Health check failed: {e}") def _cleanup_resources(self): """清理资源""" - logger.info("Cleaning up resources...") + print("Cleaning up resources...") try: # 停止Rollouter if "rollouter" in self.components: - logger.info("Shutting down Rollouter...") + print("Shutting down Rollouter...") try: shutdown_future = self.components["rollouter"].shutdown.remote() ray.get(shutdown_future, timeout=10.0) except Exception as e: - logger.warning(f"Error shutting down Rollouter: {e}") + print(f"Error shutting down Rollouter: {e}") # 清理MessageQueue if "message_queue_client" in self.components: - logger.info("Cleaning up MessageQueue...") + print("Cleaning up MessageQueue...") try: self.components["message_queue_client"].shutdown() except Exception as e: - logger.warning(f"Error cleaning up MessageQueue: {e}") + print(f"Error cleaning up MessageQueue: {e}") # 清理参数同步器 if "param_synchronizer" in self.components: - logger.info("Cleaning up parameter synchronizer...") + print("Cleaning up parameter synchronizer...") # TODO: 添加参数同步器的清理逻辑 - logger.info("Resource cleanup completed") + print("Resource cleanup completed") except Exception as e: - logger.error(f"Error during cleanup: {e}") + print(f"Error during cleanup: {e}") def get_training_status(self) -> dict: """获取训练状态""" @@ -495,7 +460,7 @@ def get_training_status(self) -> dict: "rollouter_stats": rollouter_stats, } except Exception as e: - logger.error(f"Error getting training status: {e}") + print(f"Error getting training status: {e}") return {"status": "error", "error": str(e)} @@ 
-503,27 +468,9 @@ def get_training_status(self) -> dict: def main(config): """主入口函数""" from verl.trainer.main_ppo import run_ppo - # 确保异步训练配置存在 if not hasattr(config, "async_training"): - # 设置默认异步训练配置 - config.async_training = OmegaConf.create( - { - "staleness_threshold": 3, - "max_staleness_allowed": 5, - "max_queue_size": 1000, - "min_batch_count": 1, - "batch_timeout": 30.0, - "generation_timeout": 30.0, - "batch_generation_interval": 0.1, - "max_sync_retries": 3, - "sync_timeout": 30.0, - "sync_retry_delay": 1.0, - } - ) - logger.info("Using default async training configuration") - - logger.info("Starting fully async PPO training with improved architecture") + raise RuntimeError("must set async_training config") run_ppo(config, task_runner_class=FullyAsyncTaskRunner) diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 6274237c6a8..59047db95f7 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -15,6 +15,7 @@ import threading import time from concurrent.futures import ThreadPoolExecutor +from pprint import pprint from typing import Optional import ray @@ -28,8 +29,6 @@ from verl.utils.debug import marked_timer from verl.utils.tracking import ValidationGenerationsLogger -logger = logging.getLogger(__name__) - @ray.remote(num_cpus=10, max_concurrency=10) class FullyAsyncRollouter(RayPPOTrainer): @@ -48,10 +47,6 @@ def __init__( processor=None, reward_fn=None, val_reward_fn=None, - train_dataset: Dataset | None = None, - val_dataset: Dataset | None = None, - collate_fn=None, - train_sampler: Sampler | None = None, device_name=None, ): """ @@ -73,7 +68,6 @@ def __init__( train_sampler (Optional[Sampler], optional): Sampler for the training dataset. Defaults to None. device_name (str, optional): Device name for training (e.g., "cuda", "cpu"). Defaults to None. 
""" - # Store the tokenizer for text processing self.tokenizer = tokenizer self.processor = processor @@ -99,7 +93,18 @@ def __init__( self.use_reference_policy = False self.use_rm = False + + # 创建数据集 + print("Creating datasets...") + from verl.trainer.main_ppo import create_rl_dataset, create_rl_sampler + from verl.utils.dataset.rl_dataset import collate_fn + + train_dataset = create_rl_dataset(config.data.train_files, config.data, tokenizer, processor) + val_dataset = create_rl_dataset(config.data.val_files, config.data, tokenizer, processor) + train_sampler = create_rl_sampler(config.data, train_dataset) + self._validate_config() + pprint(f"Rollouter _create_dataloader...\n{train_dataset}\n{val_dataset}") self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler) # rollouter 参数配置 @@ -159,14 +164,6 @@ def _validate_config(self): if not hasattr(self.config, "async_training"): raise ValueError("Missing async_training configuration") - def init_workers(self): - """初始化rollout workers""" - with self.lock: - logger.info("Initializing Rollouter workers...") - self._init_resource_pools() - self.rollout_wg = self.all_wg["rollout"] - self.rollout_wg.init_model() - def _create_actor_rollout_classes(self): # only create rollout for role in [Role.Rollout]: @@ -183,17 +180,6 @@ def _init_models(self): self.rollout_wg.init_model() self.actor_rollout_wg = self.rollout_wg - def _init_async_rollout_manager(self): - # create async rollout manager and request scheduler - self.async_rollout_mode = False - if self.config.actor_rollout_ref.rollout.mode == "async": - from verl.experimental.agent_loop import AgentLoopManager - - self.async_rollout_mode = True - self.async_rollout_manager = AgentLoopManager( - config=self.config, - worker_group=self.actor_rollout_wg, - ) def update_rollout_weights(self, param_version: int) -> bool: """ @@ -206,11 +192,11 @@ def update_rollout_weights(self, param_version: int) -> bool: Returns: bool: 是否成功更新参数 """ - 
logger.info(f"Updating rollout weights to version {param_version}") + self.logger.info(f"Updating rollout weights to version {param_version}") with self.sync_lock: if self.sync_in_progress: - logger.warning(f"Sync already in progress, skipping version {param_version}") + self.logger.warning(f"Sync already in progress, skipping version {param_version}") return False self.sync_in_progress = True @@ -218,7 +204,7 @@ def update_rollout_weights(self, param_version: int) -> bool: try: # 暂停rollout - 带超时机制 if not self.rollout_controller.pause(timeout=10.0): - logger.error("Failed to pause rollout within timeout") + print("Failed to pause rollout within timeout") return False # 等待当前generation完成(如果有的话) @@ -231,12 +217,12 @@ def update_rollout_weights(self, param_version: int) -> bool: self.current_param_version = param_version self.param_sync_requests += 1 self.last_sync_time = time.time() - logger.info(f"Successfully updated rollout weights to version {param_version}") + self.logger.info(f"Successfully updated rollout weights to version {param_version}") else: - logger.error(f"Failed to sync parameters to version {param_version}") + print(f"Failed to sync parameters to version {param_version}") except Exception as e: - logger.error(f"Error during parameter sync: {e}") + print(f"Error during parameter sync: {e}") sync_success = False finally: # 恢复rollout @@ -268,15 +254,15 @@ def _execute_parameter_sync(self, param_version: int) -> bool: # 执行参数同步 if self.param_synchronizer: self.param_synchronizer.sync_weights() - logger.debug("Parameter synchronization completed via synchronizer") + self.logger.debug("Parameter synchronization completed via synchronizer") else: # 直接使用rollout worker group的同步机制 if hasattr(self.rollout_wg, "sync_rollout_weights"): sync_futures = self.rollout_wg.sync_rollout_weights() ray.get(sync_futures) - logger.debug("Parameter synchronization completed via rollout worker group") + self.logger.debug("Parameter synchronization completed via rollout worker 
group") else: - logger.warning("No parameter synchronization mechanism available") + self.logger.warning("No parameter synchronization mechanism available") return False # 恢复推理引擎 @@ -291,7 +277,7 @@ def _execute_parameter_sync(self, param_version: int) -> bool: return True except Exception as e: - logger.error(f"Parameter sync execution failed: {e}") + print(f"Parameter sync execution failed: {e}") return False def _create_continuous_iterator(self): @@ -305,7 +291,7 @@ def _create_continuous_iterator(self): def fit(self): """开始异步生成样本 - 改进的主运行逻辑""" - logger.info("Starting Rollouter...") + self.logger.info("Starting Rollouter...") if self.message_queue_client is None: raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.") if self.param_synchronizer is None: @@ -328,7 +314,7 @@ def fit(self): self.generation_thread.join() self.monitor_thread.join() - logger.info("Rollouter fit completed") + self.logger.info("Rollouter fit completed") def _generation_loop(self): """ @@ -346,10 +332,10 @@ def _generation_loop(self): from verl.utils.tracking import Tracking - logger = Tracking( + self.logger = Tracking( project_name=self.config.trainer.project_name, experiment_name=self.config.trainer.experiment_name, - default_backend=self.config.trainer.logger, + default_backend=self.config.trainer.self.logger, config=OmegaConf.to_container(self.config, resolve=True), ) @@ -364,7 +350,7 @@ def _generation_loop(self): val_metrics = self._validate() assert val_metrics, f"{val_metrics=}" pprint(f"Initial validation metrics: {val_metrics}") - logger.log(data=val_metrics, step=self.global_steps) + self.logger.log(data=val_metrics, step=self.global_steps) if self.config.trainer.get("val_only", False): return @@ -400,7 +386,7 @@ def _generation_loop(self): # 如果被暂停,等待恢复 while self.paused and self.running: - logger.debug("Generation thread paused, waiting...") + self.logger.debug("Generation thread paused, waiting...") self.condition.wait() # 再次检查运行状态 @@ -441,7 
+427,7 @@ def _generation_loop(self): if success: self.total_generated_samples += 1 if self.total_generated_samples % 10 == 0: - logger.info( + self.logger.info( f"Generated {self.total_generated_samples} batches, " f"param_version={self.current_param_version}, " f"errors={self.generation_errors}" @@ -449,7 +435,7 @@ def _generation_loop(self): else: self.dropped_stale_samples += 1 if self.dropped_stale_samples % 5 == 0: - logger.warning(f"Dropped stale samples: {self.dropped_stale_samples}") + self.logger.warning(f"Dropped stale samples: {self.dropped_stale_samples}") def _monitor_loop(self): """监控线程 - 监控状态并处理控制信号""" @@ -478,12 +464,12 @@ def _monitor_loop(self): if self.paused: self.paused = False self.condition.notify_all() - logger.info("Generation resumed") + self.logger.info("Generation resumed") except Exception as e: - logger.error(f"Error in monitor loop: {e}") + print(f"Error in monitor loop: {e}") finally: - logger.info("Monitor thread exiting") + self.logger.info("Monitor thread exiting") def _report_loop(self): try: @@ -504,14 +490,14 @@ def _report_loop(self): # 检查生成线程状态 if not self.generation_thread.is_alive(): - logger.error("Generation thread died, restarting...") + print("Generation thread died, restarting...") raise RuntimeError("generation_thread not alive") except KeyboardInterrupt: - logger.info("Received interrupt signal, shutting down...") + self.logger.info("Received interrupt signal, shutting down...") except Exception as e: - logger.error(f"Error in main loop: {e}") + print(f"Error in main loop: {e}") finally: self.shutdown() @@ -529,7 +515,7 @@ def _should_pause_generation(self) -> bool: # 如果版本差异过大,暂停生成 if version_diff >= self.max_staleness_allowed: - logger.debug( + self.logger.debug( f"Should pause due to staleness: rollout_version={self.current_param_version}, " f"trainer_version={current_trainer_version}, diff={version_diff}" ) @@ -538,13 +524,13 @@ def _should_pause_generation(self) -> bool: # 如果队列太满,也暂停生成 max_queue_size = 
self.staleness_threshold * self.config.data.train_batch_size if queue_size >= max_queue_size: - logger.debug(f"Should pause due to full queue: size={queue_size}, max={max_queue_size}") + self.logger.debug(f"Should pause due to full queue: size={queue_size}, max={max_queue_size}") return True return False except Exception as e: - logger.error(f"Error checking pause conditions: {e}") + print(f"Error checking pause conditions: {e}") return True # 出错时暂停生成 def pause(self) -> bool: @@ -570,12 +556,12 @@ def resume(self) -> bool: self.paused = False self.condition.notify_all() - logger.info("Generation resumed") + self.logger.info("Generation resumed") return True def shutdown(self): """关闭Rollouter - 改进的关闭逻辑""" - logger.info("Shutting down Rollouter...") + self.logger.info("Shutting down Rollouter...") with self.lock: self.running = False @@ -584,19 +570,19 @@ def shutdown(self): # 等待生成线程结束 if self.generation_thread and self.generation_thread.is_alive(): - logger.info("Waiting for generation thread to finish...") + self.logger.info("Waiting for generation thread to finish...") self.generation_thread.join(timeout=10.0) if self.generation_thread.is_alive(): - logger.warning("Generation thread did not finish within timeout") + self.logger.warning("Generation thread did not finish within timeout") # 等待监控线程结束 if self.monitor_thread and self.monitor_thread.is_alive(): - logger.info("Waiting for monitor thread to finish...") + self.logger.info("Waiting for monitor thread to finish...") self.monitor_thread.join(timeout=5.0) if self.monitor_thread.is_alive(): - logger.warning("Monitor thread did not finish within timeout") + self.logger.warning("Monitor thread did not finish within timeout") # 关闭线程池 if self.thread_executor: @@ -608,9 +594,9 @@ def shutdown(self): # TODO: 添加异步rollout管理器的清理逻辑 pass except Exception as e: - logger.warning(f"Error cleaning up async rollout manager: {e}") + self.logger.warning(f"Error cleaning up async rollout manager: {e}") - logger.info("Rollouter 
shutdown complete") + self.logger.info("Rollouter shutdown complete") def get_statistics(self) -> dict: with self.lock: diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 0dd90127d7d..2a507e41efa 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -57,10 +57,6 @@ def __init__( processor=None, reward_fn=None, val_reward_fn=None, - train_dataset: Dataset | None = None, - val_dataset: Dataset | None = None, - collate_fn=None, - train_sampler: Sampler | None = None, device_name=None, ): """ @@ -125,7 +121,6 @@ def __init__( self.use_critic = False self._validate_config() - self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler) self.message_queue_client = None def set_message_queue_client(self, message_queue_client: MessageQueueClient): From 274883a81ee9bf9c5beb9f4b51dc721c16eaa416 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Tue, 5 Aug 2025 16:10:32 +0800 Subject: [PATCH 021/182] gen data --- recipe/fully_async_policy/fully_async_main.py | 33 ++----- .../fully_async_rollouter.py | 88 +++++++++---------- .../fully_async_policy/fully_async_trainer.py | 5 -- recipe/fully_async_policy/message_queue.py | 13 ++- .../{ => unittest}/test_fully_async.py | 14 ++- recipe/fully_async_policy/unittest/test_mq.py | 35 ++++---- 6 files changed, 73 insertions(+), 115 deletions(-) rename recipe/fully_async_policy/{ => unittest}/test_fully_async.py (91%) diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py index 54460b3611d..dd544e9b49a 100644 --- a/recipe/fully_async_policy/fully_async_main.py +++ b/recipe/fully_async_policy/fully_async_main.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import logging import os import signal import socket @@ -27,7 +26,6 @@ from recipe.fully_async_policy.fully_async_rollouter import FullyAsyncRollouter from recipe.fully_async_policy.fully_async_trainer import FullyAsyncTrainer from recipe.fully_async_policy.message_queue import MessageQueue, MessageQueueClient -from recipe.fully_async_policy.param_sync import AsyncParameterSynchronizer from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role from verl.trainer.ppo.reward import load_reward_manager from verl.utils.fs import copy_to_local @@ -159,9 +157,9 @@ def run(self, config): self._setup_signal_handlers() # 初始化基础组件 self._initialize_components(config) - time.sleep(60) + # time.sleep(60) # 启动训练流程 - # self._run_training_loop() + self._run_training_loop() # self._cleanup_resources() @@ -239,7 +237,7 @@ def _initialize_components(self, config) -> None: # 创建Trainer print("Creating FullyAsyncTrainer...") - self._create_trainer(config) + # self._create_trainer(config) # 设置参数同步 # print("Setting up parameter synchronization...") @@ -308,31 +306,13 @@ def _run_training_loop(self): print("Starting Rollouter in background...") rollouter_future = self.components["rollouter"].fit.remote() - trainer_future = self.components["trainer"].fit.remote() - self._monitor_components() + # trainer_future = self.components["trainer"].fit.remote() + # self._monitor_components() ray.get(rollouter_future) - ray.get(trainer_future) + # ray.get(trainer_future) print("Training completed or interrupted") - def _run_rollouter(self): - try: - ray.get(self.components["rollouter"].fit.remote()) - except Exception as e: - print(f"Rollouter error: {e}") - self.running = False - self.shutdown_event.set() - - def _run_trainer(self): - """运行trainer""" - try: - self.components["trainer"].fit() - except Exception as e: - print(f"Trainer error: {e}") - finally: - self.running = False - self.shutdown_event.set() - def _monitor_components(self): """监控组件状态""" print("Starting component monitoring...") 
@@ -468,6 +448,7 @@ def get_training_status(self) -> dict: def main(config): """主入口函数""" from verl.trainer.main_ppo import run_ppo + # 确保异步训练配置存在 if not hasattr(config, "async_training"): raise RuntimeError("must set async_training config") diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 59047db95f7..f273b3a45a4 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -11,16 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import logging import threading import time from concurrent.futures import ThreadPoolExecutor from pprint import pprint -from typing import Optional import ray from omegaconf import OmegaConf -from torch.utils.data import Dataset, Sampler from tqdm import tqdm from recipe.fully_async_policy.message_queue import MessageQueueClient @@ -38,16 +35,16 @@ class FullyAsyncRollouter(RayPPOTrainer): """ def __init__( - self, - config, - tokenizer, - role_worker_mapping: dict[Role, WorkerType], - resource_pool_manager: ResourcePoolManager, - ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, - processor=None, - reward_fn=None, - val_reward_fn=None, - device_name=None, + self, + config, + tokenizer, + role_worker_mapping: dict[Role, WorkerType], + resource_pool_manager: ResourcePoolManager, + ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, + processor=None, + reward_fn=None, + val_reward_fn=None, + device_name=None, ): """ Initialize distributed PPO trainer with Ray backend. 
@@ -93,7 +90,6 @@ def __init__( self.use_reference_policy = False self.use_rm = False - # 创建数据集 print("Creating datasets...") from verl.trainer.main_ppo import create_rl_dataset, create_rl_sampler @@ -180,7 +176,6 @@ def _init_models(self): self.rollout_wg.init_model() self.actor_rollout_wg = self.rollout_wg - def update_rollout_weights(self, param_version: int) -> bool: """ 更新rollout模型参数 - 改进的参数同步实现 @@ -192,11 +187,11 @@ def update_rollout_weights(self, param_version: int) -> bool: Returns: bool: 是否成功更新参数 """ - self.logger.info(f"Updating rollout weights to version {param_version}") + print(f"Updating rollout weights to version {param_version}") with self.sync_lock: if self.sync_in_progress: - self.logger.warning(f"Sync already in progress, skipping version {param_version}") + print(f"Sync already in progress, skipping version {param_version}") return False self.sync_in_progress = True @@ -217,7 +212,7 @@ def update_rollout_weights(self, param_version: int) -> bool: self.current_param_version = param_version self.param_sync_requests += 1 self.last_sync_time = time.time() - self.logger.info(f"Successfully updated rollout weights to version {param_version}") + print(f"Successfully updated rollout weights to version {param_version}") else: print(f"Failed to sync parameters to version {param_version}") @@ -254,15 +249,15 @@ def _execute_parameter_sync(self, param_version: int) -> bool: # 执行参数同步 if self.param_synchronizer: self.param_synchronizer.sync_weights() - self.logger.debug("Parameter synchronization completed via synchronizer") + print("Parameter synchronization completed via synchronizer") else: # 直接使用rollout worker group的同步机制 if hasattr(self.rollout_wg, "sync_rollout_weights"): sync_futures = self.rollout_wg.sync_rollout_weights() ray.get(sync_futures) - self.logger.debug("Parameter synchronization completed via rollout worker group") + print("Parameter synchronization completed via rollout worker group") else: - self.logger.warning("No parameter 
synchronization mechanism available") + print("No parameter synchronization mechanism available") return False # 恢复推理引擎 @@ -291,11 +286,11 @@ def _create_continuous_iterator(self): def fit(self): """开始异步生成样本 - 改进的主运行逻辑""" - self.logger.info("Starting Rollouter...") + print("Starting Rollouter...") if self.message_queue_client is None: raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.") - if self.param_synchronizer is None: - raise ValueError("param_synchronizer client not set. Call set_parameter_synchronizer() first.") + # if self.param_synchronizer is None: + # raise ValueError("param_synchronizer client not set. Call set_parameter_synchronizer() first.") # 设置运行状态 with self.lock: @@ -314,7 +309,7 @@ def fit(self): self.generation_thread.join() self.monitor_thread.join() - self.logger.info("Rollouter fit completed") + print("Rollouter fit completed") def _generation_loop(self): """ @@ -335,7 +330,7 @@ def _generation_loop(self): self.logger = Tracking( project_name=self.config.trainer.project_name, experiment_name=self.config.trainer.experiment_name, - default_backend=self.config.trainer.self.logger, + default_backend=self.config.trainer.logger, config=OmegaConf.to_container(self.config, resolve=True), ) @@ -386,7 +381,7 @@ def _generation_loop(self): # 如果被暂停,等待恢复 while self.paused and self.running: - self.logger.debug("Generation thread paused, waiting...") + print("Generation thread paused, waiting...") self.condition.wait() # 再次检查运行状态 @@ -413,21 +408,19 @@ def _generation_loop(self): "timing": timing_raw, "generation_timestamp": time.time(), "rollout_param_version": self.current_param_version, - "epoch": epoch, } # 放入队列 success = self.message_queue_client.put_samples( - epoch=epoch, - sample=gen_batch_output, + samples=gen_batch_output, param_version=self.current_param_version, - rollout_metadata=rollout_metadata, + rollout_metadata_list=rollout_metadata, ) with self.lock: if success: self.total_generated_samples += 1 if 
self.total_generated_samples % 10 == 0: - self.logger.info( + print( f"Generated {self.total_generated_samples} batches, " f"param_version={self.current_param_version}, " f"errors={self.generation_errors}" @@ -435,7 +428,7 @@ def _generation_loop(self): else: self.dropped_stale_samples += 1 if self.dropped_stale_samples % 5 == 0: - self.logger.warning(f"Dropped stale samples: {self.dropped_stale_samples}") + print(f"Dropped stale samples: {self.dropped_stale_samples}") def _monitor_loop(self): """监控线程 - 监控状态并处理控制信号""" @@ -459,17 +452,17 @@ def _monitor_loop(self): last_stats_time = current_time # 检查是否应该恢复生成 - if self._should_resume_generation(): + if not self._should_pause_generation(): with self.lock: if self.paused: self.paused = False self.condition.notify_all() - self.logger.info("Generation resumed") + print("Generation resumed") except Exception as e: print(f"Error in monitor loop: {e}") finally: - self.logger.info("Monitor thread exiting") + print("Monitor thread exiting") def _report_loop(self): try: @@ -493,9 +486,8 @@ def _report_loop(self): print("Generation thread died, restarting...") raise RuntimeError("generation_thread not alive") - except KeyboardInterrupt: - self.logger.info("Received interrupt signal, shutting down...") + print("Received interrupt signal, shutting down...") except Exception as e: print(f"Error in main loop: {e}") finally: @@ -515,7 +507,7 @@ def _should_pause_generation(self) -> bool: # 如果版本差异过大,暂停生成 if version_diff >= self.max_staleness_allowed: - self.logger.debug( + print( f"Should pause due to staleness: rollout_version={self.current_param_version}, " f"trainer_version={current_trainer_version}, diff={version_diff}" ) @@ -524,7 +516,7 @@ def _should_pause_generation(self) -> bool: # 如果队列太满,也暂停生成 max_queue_size = self.staleness_threshold * self.config.data.train_batch_size if queue_size >= max_queue_size: - self.logger.debug(f"Should pause due to full queue: size={queue_size}, max={max_queue_size}") + print(f"Should pause due 
to full queue: size={queue_size}, max={max_queue_size}") return True return False @@ -556,12 +548,12 @@ def resume(self) -> bool: self.paused = False self.condition.notify_all() - self.logger.info("Generation resumed") + print("Generation resumed") return True def shutdown(self): """关闭Rollouter - 改进的关闭逻辑""" - self.logger.info("Shutting down Rollouter...") + print("Shutting down Rollouter...") with self.lock: self.running = False @@ -570,19 +562,19 @@ def shutdown(self): # 等待生成线程结束 if self.generation_thread and self.generation_thread.is_alive(): - self.logger.info("Waiting for generation thread to finish...") + print("Waiting for generation thread to finish...") self.generation_thread.join(timeout=10.0) if self.generation_thread.is_alive(): - self.logger.warning("Generation thread did not finish within timeout") + print("Generation thread did not finish within timeout") # 等待监控线程结束 if self.monitor_thread and self.monitor_thread.is_alive(): - self.logger.info("Waiting for monitor thread to finish...") + print("Waiting for monitor thread to finish...") self.monitor_thread.join(timeout=5.0) if self.monitor_thread.is_alive(): - self.logger.warning("Monitor thread did not finish within timeout") + print("Monitor thread did not finish within timeout") # 关闭线程池 if self.thread_executor: @@ -594,9 +586,9 @@ def shutdown(self): # TODO: 添加异步rollout管理器的清理逻辑 pass except Exception as e: - self.logger.warning(f"Error cleaning up async rollout manager: {e}") + print(f"Error cleaning up async rollout manager: {e}") - self.logger.info("Rollouter shutdown complete") + print("Rollouter shutdown complete") def get_statistics(self) -> dict: with self.lock: diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 2a507e41efa..1a3076f19c2 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -20,12 +20,10 @@ import numpy as np import ray from omegaconf import 
OmegaConf -from torch.utils.data import Dataset, Sampler from tqdm import tqdm from recipe.fully_async_policy.message_queue import MessageQueueClient, QueueSample from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup -from verl.single_controller.ray.base import create_colocated_worker_cls from verl.trainer.ppo import core_algos from verl.trainer.ppo.core_algos import AdvantageEstimator from verl.trainer.ppo.ray_trainer import ( @@ -155,7 +153,6 @@ def _init_models(self): self.actor_wg.init_model() self.actor_rollout_wg = self.actor_wg # to be compatible with the functions that not be modified - def fit(self): """ The training loop of PPO. @@ -168,8 +165,6 @@ def fit(self): if self.message_queue_client is None: raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.") - from omegaconf import OmegaConf - from verl.utils.tracking import Tracking logger = Tracking( diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py index 06f0d2cbbe9..dcd3c27ed15 100644 --- a/recipe/fully_async_policy/message_queue.py +++ b/recipe/fully_async_policy/message_queue.py @@ -31,7 +31,6 @@ class QueueSample: """单个batch样本,包含参数版本和新鲜度信息""" id: str - epoch: int data: Any param_version: int timestamp: float @@ -77,13 +76,12 @@ def __init__(self, config: DictConfig, max_queue_size: int = 1000): ) def put_samples( - self, epoch: int, samples: list[Any], param_version: int, rollout_metadata_list: list[dict[str, Any]] = None + self, samples: list[Any], param_version: int, rollout_metadata_list: list[dict[str, Any]] = None ) -> bool: """ 放入一个batch样本到队列 Args: - epoch: 当前epoch samples: 样本数据 param_version: 参数版本号 rollout_metadata_list: rollout相关的元数据 @@ -110,7 +108,6 @@ def put_samples( for sample, meta in zip(samples, rollout_metadata_list, strict=False): queue_sample = QueueSample( id=str(uuid.uuid4()), - epoch=epoch, data=sample, param_version=param_version, timestamp=time.time(), @@ -237,13 +234,13 @@ 
class MessageQueueClient: def __init__(self, queue_actor: Any): self.queue_actor = queue_actor - def put_batch( - self, epoch: int, batch: list[Any], param_version: int, rollout_metadata_list: list[dict[str, Any]] = None + def put_samples( + self, samples: list[Any], param_version: int, rollout_metadata_list: list[dict[str, Any]] = None ) -> bool: """放入batch到队列""" - return ray.get(self.queue_actor.put_samples.remote(epoch, batch, param_version, rollout_metadata_list)) + return ray.get(self.queue_actor.put_samples.remote(samples, param_version, rollout_metadata_list)) - def get_batch(self, min_batch_count: int = 1) -> list[QueueSample]: + def get_samples(self, min_batch_count: int = 1) -> list[QueueSample]: """从队列获取batch,一直等待直到有足够样本""" return ray.get(self.queue_actor.get_samples.remote(min_batch_count)) diff --git a/recipe/fully_async_policy/test_fully_async.py b/recipe/fully_async_policy/unittest/test_fully_async.py similarity index 91% rename from recipe/fully_async_policy/test_fully_async.py rename to recipe/fully_async_policy/unittest/test_fully_async.py index c138debcaa0..a6646b17575 100644 --- a/recipe/fully_async_policy/test_fully_async.py +++ b/recipe/fully_async_policy/unittest/test_fully_async.py @@ -61,14 +61,13 @@ def test_basic_put_get(self): mock_batch = Mock(spec=DataProto) # 放入样本 - success = self.client.put_batch(epoch=0, batch=mock_batch, param_version=1, rollout_metadata={"test": "data"}) + success = self.client.put_samples(samples=mock_batch, param_version=1, rollout_metadata={"test": "data"}) self.assertTrue(success) # 获取样本 - samples = self.client.get_batch(min_batch_count=1, timeout=5.0) + samples = self.client.get_samples(min_batch_count=1, timeout=5.0) self.assertIsNotNone(samples) self.assertEqual(len(samples), 1) - self.assertEqual(samples[0].epoch, 0) self.assertEqual(samples[0].param_version, 1) def test_freshness_control(self): @@ -79,9 +78,8 @@ def test_freshness_control(self): self.client.update_param_version(10) # 尝试放入过期样本 - success = 
self.client.put_batch( - epoch=0, - batch=mock_batch, + success = self.client.put_samples( + samples=mock_batch, param_version=5, # 版本差异为5,超过阈值3 rollout_metadata={}, ) @@ -161,11 +159,11 @@ def test_integration(): # 生产样本 for i in range(5): - success = client.put_batch(epoch=i, batch=mock_batch, param_version=i, rollout_metadata={"batch_id": i}) + success = client.put_samples(samples=mock_batch, param_version=i, rollout_metadata={"batch_id": i}) assert success, f"Failed to put batch {i}" # 消费样本 - samples = client.get_batch(min_batch_count=3, timeout=10.0) + samples = client.get_samples(min_batch_count=3, timeout=10.0) assert samples is not None, "Failed to get samples" assert len(samples) == 3, f"Expected 3 samples, got {len(samples)}" diff --git a/recipe/fully_async_policy/unittest/test_mq.py b/recipe/fully_async_policy/unittest/test_mq.py index 52a9f17d8ae..02e9839bcfd 100644 --- a/recipe/fully_async_policy/unittest/test_mq.py +++ b/recipe/fully_async_policy/unittest/test_mq.py @@ -66,9 +66,7 @@ def test_put_samples_success(self, message_queue_client, mock_data_proto): samples = [mock_data_proto, mock_data_proto] metadata_list = [{"test": "data1"}, {"test": "data2"}] - result = message_queue_client.put_batch( - epoch=1, batch=samples, param_version=1, rollout_metadata_list=metadata_list - ) + result = message_queue_client.put_samples(samples=samples, param_version=1, rollout_metadata_list=metadata_list) assert result is True @@ -85,7 +83,7 @@ def test_put_samples_without_metadata(self, message_queue_client, mock_data_prot """测试不提供metadata时的处理""" samples = [mock_data_proto, mock_data_proto] - result = message_queue_client.put_batch(epoch=1, batch=samples, param_version=1, rollout_metadata_list=None) + result = message_queue_client.put_samples(samples=samples, param_version=1, rollout_metadata_list=None) assert result is True queue_size = message_queue_client.get_queue_size() @@ -96,9 +94,7 @@ def test_put_samples_metadata_mismatch(self, message_queue_client, 
mock_data_pro samples = [mock_data_proto, mock_data_proto] metadata_list = [{"test": "data1"}] # 长度不匹配 - result = message_queue_client.put_batch( - epoch=1, batch=samples, param_version=1, rollout_metadata_list=metadata_list - ) + result = message_queue_client.put_samples(samples=samples, param_version=1, rollout_metadata_list=metadata_list) assert result is False # 应该失败 queue_size = message_queue_client.get_queue_size() @@ -111,9 +107,8 @@ def test_put_samples_staleness_check(self, message_queue_client, mock_data_proto # 尝试放入版本过旧的batch(版本差异>=3会被拒绝) samples = [mock_data_proto] - result = message_queue_client.put_batch( - epoch=1, - batch=samples, + result = message_queue_client.put_samples( + samples=samples, param_version=2, # 5-2=3, 达到阈值 rollout_metadata_list=None, ) @@ -129,7 +124,7 @@ def test_put_samples_queue_overflow(self, message_queue_client, mock_data_proto) # 填满队列(最大容量10) for i in range(6): # 每次放入2个,总共12个,超过最大容量10 samples = [mock_data_proto, mock_data_proto] - message_queue_client.put_batch(epoch=1, batch=samples, param_version=1, rollout_metadata_list=None) + message_queue_client.put_samples(samples=samples, param_version=1, rollout_metadata_list=None) # 队列大小应该保持在最大值 queue_size = message_queue_client.get_queue_size() @@ -144,10 +139,10 @@ def test_get_samples_success(self, message_queue_client, mock_data_proto): # 先放入一些samples samples = [mock_data_proto, mock_data_proto, mock_data_proto] metadata_list = [{"index": 0}, {"index": 1}, {"index": 2}] - message_queue_client.put_batch(epoch=1, batch=samples, param_version=1, rollout_metadata_list=metadata_list) + message_queue_client.put_samples(samples=samples, param_version=1, rollout_metadata_list=metadata_list) # 获取2个samples - retrieved_samples = message_queue_client.get_batch(min_batch_count=2) + retrieved_samples = message_queue_client.get_samples(min_batch_count=2) assert retrieved_samples is not None assert len(retrieved_samples) == 2 @@ -167,13 +162,13 @@ def test_get_samples_blocking_behavior(self, 
message_queue_client, mock_data_pro def get_samples(): # 这会阻塞直到有足够样本 - samples = message_queue_client.get_batch(min_batch_count=2) + samples = message_queue_client.get_samples(min_batch_count=2) result.append(samples) def put_samples_later(): time.sleep(0.5) # 延迟放入 samples = [mock_data_proto, mock_data_proto] - message_queue_client.put_batch(epoch=1, batch=samples, param_version=1, rollout_metadata_list=None) + message_queue_client.put_samples(samples=samples, param_version=1, rollout_metadata_list=None) # 启动消费者线程 consumer_thread = threading.Thread(target=get_samples) @@ -199,7 +194,7 @@ def test_clear_queue(self, message_queue_client, mock_data_proto): """测试清空队列""" # 先添加一些样本 samples = [mock_data_proto, mock_data_proto, mock_data_proto] - message_queue_client.put_batch(epoch=1, batch=samples, param_version=1, rollout_metadata_list=None) + message_queue_client.put_samples(samples=samples, param_version=1, rollout_metadata_list=None) # 清空队列 message_queue_client.clear_queue() @@ -213,7 +208,7 @@ def test_get_queue_size(self, message_queue_client, mock_data_proto): assert message_queue_client.get_queue_size() == 0 samples = [mock_data_proto] - message_queue_client.put_batch(epoch=1, batch=samples, param_version=1, rollout_metadata_list=None) + message_queue_client.put_samples(samples=samples, param_version=1, rollout_metadata_list=None) assert message_queue_client.get_queue_size() == 1 def test_get_statistics(self, message_queue_client): @@ -238,7 +233,7 @@ def test_get_memory_usage(self, message_queue_client, mock_data_proto): """测试获取内存使用统计""" # 添加一些样本 samples = [mock_data_proto, mock_data_proto] - message_queue_client.put_batch(epoch=1, batch=samples, param_version=1, rollout_metadata_list=None) + message_queue_client.put_samples(samples=samples, param_version=1, rollout_metadata_list=None) memory_stats = message_queue_client.get_memory_usage() @@ -287,14 +282,14 @@ def test_concurrent_put_get(self, mock_data_proto): def producer(): for i in range(50): samples = 
[mock_data_proto, mock_data_proto] - result = client.put_batch(epoch=i, batch=samples, param_version=1, rollout_metadata_list=None) + result = client.put_samples(samples=samples, param_version=1, rollout_metadata_list=None) results.append(("put", result)) time.sleep(0.1) def consumer(): for _ in range(100): try: - retrieved_samples = client.get_batch(min_batch_count=1) + retrieved_samples = client.get_samples(min_batch_count=1) results.append(("get", len(retrieved_samples) > 0)) except Exception as e: print(e) From f653a8eb39d5c39b11d8d7aa45074af420e2bf32 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Tue, 5 Aug 2025 17:32:58 +0800 Subject: [PATCH 022/182] gen data to queue --- .../config/fully_async_ppo_trainer.yaml | 12 +- recipe/fully_async_policy/fully_async_main.py | 2 +- .../fully_async_rollouter.py | 258 +++++++++--------- .../fully_async_policy/fully_async_trainer.py | 4 - recipe/fully_async_policy/message_queue.py | 18 +- .../run_fully_async_example.sh | 2 - 6 files changed, 141 insertions(+), 155 deletions(-) diff --git a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml index d97484d88f4..f9aa06cd4b6 100644 --- a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml +++ b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml @@ -11,16 +11,6 @@ defaults: async_training: # 新鲜度控制 (Freshness Control) staleness_threshold: 3 # 样本新鲜度阈值 - max_staleness_allowed: 5 # 最大允许的样本陈旧度 - - # 队列管理 (Queue Management) - max_queue_size: 1000 # 消息队列最大大小 - min_batch_count: 1 # 每次获取的最小batch数量 - batch_timeout: 30.0 # 获取batch的超时时间(秒) - - # 生成控制 (Generation Control) - generation_timeout: 30.0 # 单次生成的超时时间(秒) - batch_generation_interval: 0.1 # batch生成间隔(秒) # 参数同步 (Parameter Synchronization) max_sync_retries: 3 # 参数同步最大重试次数 @@ -35,3 +25,5 @@ rollout: name: vllm # rollout引擎: vllm, sglang n: 4 # 每个prompt生成的响应数量 +data: + gen_batch_size: 32 diff --git a/recipe/fully_async_policy/fully_async_main.py 
b/recipe/fully_async_policy/fully_async_main.py index dd544e9b49a..d7079d4af2b 100644 --- a/recipe/fully_async_policy/fully_async_main.py +++ b/recipe/fully_async_policy/fully_async_main.py @@ -224,7 +224,7 @@ def _initialize_components(self, config) -> None: # 创建MessageQueue print("Creating MessageQueue...") - max_queue_size = config.async_training.get("max_queue_size", 1000) + max_queue_size = config.async_training.staleness_threshold * config.data.train_batch_size message_queue = MessageQueue.remote(config, max_queue_size) message_queue_client = MessageQueueClient(message_queue) diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index f273b3a45a4..9196dc08e94 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -21,6 +21,7 @@ from tqdm import tqdm from recipe.fully_async_policy.message_queue import MessageQueueClient +from verl import DataProto from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup from verl.trainer.ppo.ray_trainer import RayPPOTrainer, ResourcePoolManager, Role, WorkerType from verl.utils.debug import marked_timer @@ -35,16 +36,16 @@ class FullyAsyncRollouter(RayPPOTrainer): """ def __init__( - self, - config, - tokenizer, - role_worker_mapping: dict[Role, WorkerType], - resource_pool_manager: ResourcePoolManager, - ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, - processor=None, - reward_fn=None, - val_reward_fn=None, - device_name=None, + self, + config, + tokenizer, + role_worker_mapping: dict[Role, WorkerType], + resource_pool_manager: ResourcePoolManager, + ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, + processor=None, + reward_fn=None, + val_reward_fn=None, + device_name=None, ): """ Initialize distributed PPO trainer with Ray backend. 
@@ -111,8 +112,6 @@ def __init__( # 新鲜度控制 - 改进的配置管理 async_config = config.async_training self.staleness_threshold = async_config.get("staleness_threshold", 3) - self.max_staleness_allowed = async_config.get("max_staleness_allowed", 5) - self.generation_timeout = async_config.get("generation_timeout", 30.0) # 统计信息 self.total_generated_samples = 0 @@ -145,6 +144,8 @@ def __init__( self.sync_in_progress = False self.sync_lock = threading.Lock() + self.max_queue_size = self.staleness_threshold * self.config.data.train_batch_size * self.config.actor_rollout_ref.rollout.n + def set_message_queue_client(self, message_queue_client: MessageQueueClient): """设置消息队列客户端""" with self.lock: @@ -176,105 +177,6 @@ def _init_models(self): self.rollout_wg.init_model() self.actor_rollout_wg = self.rollout_wg - def update_rollout_weights(self, param_version: int) -> bool: - """ - 更新rollout模型参数 - 改进的参数同步实现 - 这个方法由外部Trainer调用 - - Args: - param_version: 新的参数版本号 - - Returns: - bool: 是否成功更新参数 - """ - print(f"Updating rollout weights to version {param_version}") - - with self.sync_lock: - if self.sync_in_progress: - print(f"Sync already in progress, skipping version {param_version}") - return False - - self.sync_in_progress = True - - try: - # 暂停rollout - 带超时机制 - if not self.rollout_controller.pause(timeout=10.0): - print("Failed to pause rollout within timeout") - return False - - # 等待当前generation完成(如果有的话) - time.sleep(0.1) - - # 执行参数同步 - sync_success = self._execute_parameter_sync(param_version) - - if sync_success: - self.current_param_version = param_version - self.param_sync_requests += 1 - self.last_sync_time = time.time() - print(f"Successfully updated rollout weights to version {param_version}") - else: - print(f"Failed to sync parameters to version {param_version}") - - except Exception as e: - print(f"Error during parameter sync: {e}") - sync_success = False - finally: - # 恢复rollout - self.rollout_controller.resume() - self.sync_in_progress = False - - return sync_success - - def 
_execute_parameter_sync(self, param_version: int) -> bool: - """ - 执行实际的参数同步 - 改进的同步逻辑 - - Args: - param_version: 目标参数版本 - - Returns: - bool: 是否同步成功 - """ - try: - # 暂停推理引擎 - if self.async_rollout_mode and hasattr(self, "async_rollout_manager"): - # 对于异步模式,暂停服务器 - pass # 异步服务器的暂停在 pause() 中已经处理 - else: - # 对于同步模式,使用sleep/wake_up机制 - sleep_futures = self.rollout_wg.sleep() - ray.get(sleep_futures) - - # 执行参数同步 - if self.param_synchronizer: - self.param_synchronizer.sync_weights() - print("Parameter synchronization completed via synchronizer") - else: - # 直接使用rollout worker group的同步机制 - if hasattr(self.rollout_wg, "sync_rollout_weights"): - sync_futures = self.rollout_wg.sync_rollout_weights() - ray.get(sync_futures) - print("Parameter synchronization completed via rollout worker group") - else: - print("No parameter synchronization mechanism available") - return False - - # 恢复推理引擎 - if self.async_rollout_mode and hasattr(self, "async_rollout_manager"): - # 对于异步模式,恢复服务器 - pass # 异步服务器的恢复在 resume() 中已经处理 - else: - # 对于同步模式,唤醒workers - wake_futures = self.rollout_wg.wake_up() - ray.get(wake_futures) - - return True - - except Exception as e: - print(f"Parameter sync execution failed: {e}") - return False - def _create_continuous_iterator(self): """ Create a continuous data iterator across epoch @@ -349,9 +251,6 @@ def _generation_loop(self): if self.config.trainer.get("val_only", False): return - # add tqdm - progress_bar = tqdm(total=self.total_training_steps, initial=self.global_steps, desc="Training Progress") - # we start from step 1 self.global_steps += 1 last_val_metrics = None @@ -409,26 +308,36 @@ def _generation_loop(self): "generation_timestamp": time.time(), "rollout_param_version": self.current_param_version, } + + gen_batch_output: DataProto = gen_batch_output + print(gen_batch_output) + for i in gen_batch_output: + print(i) + # 放入队列 success = self.message_queue_client.put_samples( samples=gen_batch_output, param_version=self.current_param_version, 
rollout_metadata_list=rollout_metadata, ) - + print(f"put samples {success}") with self.lock: if success: self.total_generated_samples += 1 - if self.total_generated_samples % 10 == 0: - print( - f"Generated {self.total_generated_samples} batches, " - f"param_version={self.current_param_version}, " - f"errors={self.generation_errors}" - ) else: self.dropped_stale_samples += 1 - if self.dropped_stale_samples % 5 == 0: - print(f"Dropped stale samples: {self.dropped_stale_samples}") + + if self.global_steps % 1 == 0: + print(f"Generated {self.total_generated_samples} batches, \n" + f"param_version={self.current_param_version}, \n" + f"errors={self.generation_errors}, \n" + f"Dropped stale samples: {self.dropped_stale_samples}\n") + + self.global_steps += 1 + + if is_last_step: + pprint(f"Final validation metrics: {last_val_metrics}") + return def _monitor_loop(self): """监控线程 - 监控状态并处理控制信号""" @@ -506,7 +415,7 @@ def _should_pause_generation(self) -> bool: version_diff = self.current_param_version - current_trainer_version # 如果版本差异过大,暂停生成 - if version_diff >= self.max_staleness_allowed: + if version_diff >= self.staleness_threshold: print( f"Should pause due to staleness: rollout_version={self.current_param_version}, " f"trainer_version={current_trainer_version}, diff={version_diff}" @@ -514,7 +423,7 @@ def _should_pause_generation(self) -> bool: return True # 如果队列太满,也暂停生成 - max_queue_size = self.staleness_threshold * self.config.data.train_batch_size + if queue_size >= max_queue_size: print(f"Should pause due to full queue: size={queue_size}, max={max_queue_size}") return True @@ -604,3 +513,102 @@ def get_statistics(self) -> dict: "queue_size": f"{queue_stats['queue_size']}", } return stats + + def update_rollout_weights(self, param_version: int) -> bool: + """ + 更新rollout模型参数 - 改进的参数同步实现 + 这个方法由外部Trainer调用 + + Args: + param_version: 新的参数版本号 + + Returns: + bool: 是否成功更新参数 + """ + print(f"Updating rollout weights to version {param_version}") + + with self.sync_lock: + 
if self.sync_in_progress: + print(f"Sync already in progress, skipping version {param_version}") + return False + + self.sync_in_progress = True + + try: + # 暂停rollout - 带超时机制 + if not self.rollout_controller.pause(timeout=10.0): + print("Failed to pause rollout within timeout") + return False + + # 等待当前generation完成(如果有的话) + time.sleep(0.1) + + # 执行参数同步 + sync_success = self._execute_parameter_sync(param_version) + + if sync_success: + self.current_param_version = param_version + self.param_sync_requests += 1 + self.last_sync_time = time.time() + print(f"Successfully updated rollout weights to version {param_version}") + else: + print(f"Failed to sync parameters to version {param_version}") + + except Exception as e: + print(f"Error during parameter sync: {e}") + sync_success = False + finally: + # 恢复rollout + self.rollout_controller.resume() + self.sync_in_progress = False + + return sync_success + + def _execute_parameter_sync(self, param_version: int) -> bool: + """ + 执行实际的参数同步 - 改进的同步逻辑 + + Args: + param_version: 目标参数版本 + + Returns: + bool: 是否同步成功 + """ + try: + # 暂停推理引擎 + if self.async_rollout_mode and hasattr(self, "async_rollout_manager"): + # 对于异步模式,暂停服务器 + pass # 异步服务器的暂停在 pause() 中已经处理 + else: + # 对于同步模式,使用sleep/wake_up机制 + sleep_futures = self.rollout_wg.sleep() + ray.get(sleep_futures) + + # 执行参数同步 + if self.param_synchronizer: + self.param_synchronizer.sync_weights() + print("Parameter synchronization completed via synchronizer") + else: + # 直接使用rollout worker group的同步机制 + if hasattr(self.rollout_wg, "sync_rollout_weights"): + sync_futures = self.rollout_wg.sync_rollout_weights() + ray.get(sync_futures) + print("Parameter synchronization completed via rollout worker group") + else: + print("No parameter synchronization mechanism available") + return False + + # 恢复推理引擎 + if self.async_rollout_mode and hasattr(self, "async_rollout_manager"): + # 对于异步模式,恢复服务器 + pass # 异步服务器的恢复在 resume() 中已经处理 + else: + # 对于同步模式,唤醒workers + wake_futures = 
self.rollout_wg.wake_up() + ray.get(wake_futures) + + return True + + except Exception as e: + print(f"Parameter sync execution failed: {e}") + return False diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 1a3076f19c2..db6bdfeaebc 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -93,10 +93,6 @@ def __init__( self.use_rm = Role.RewardModel in role_worker_mapping self.ray_worker_group_cls = ray_worker_group_cls self.device_name = device_name if device_name else self.config.trainer.device - self.validation_generations_logger = ValidationGenerationsLogger( - project_name=self.config.trainer.project_name, - experiment_name=self.config.trainer.experiment_name, - ) # if ref_in_actor is True, the reference policy will be actor without lora applied self.ref_in_actor = config.actor_rollout_ref.model.get("lora_rank", 0) > 0 diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py index dcd3c27ed15..61723cde953 100644 --- a/recipe/fully_async_policy/message_queue.py +++ b/recipe/fully_async_policy/message_queue.py @@ -76,7 +76,7 @@ def __init__(self, config: DictConfig, max_queue_size: int = 1000): ) def put_samples( - self, samples: list[Any], param_version: int, rollout_metadata_list: list[dict[str, Any]] = None + self, samples: list[Any] | Any, param_version: int, rollout_metadata: dict[str, Any] = None ) -> bool: """ 放入一个batch样本到队列 @@ -84,7 +84,7 @@ def put_samples( Args: samples: 样本数据 param_version: 参数版本号 - rollout_metadata_list: rollout相关的元数据 + rollout_metadata: rollout相关的元数据 Returns: bool: 是否成功放入队列 @@ -97,21 +97,13 @@ def put_samples( logger.debug(f"Dropped stale sample: staleness={staleness}, threshold={self.staleness_threshold}") return False - # 处理 rollout_metadatas 为 None 的情况 - if rollout_metadata_list is None: - rollout_metadata_list = [{}] * len(samples) - - if 
len(rollout_metadata_list) != len(samples): - logger.warning(f"len(rollout_metadata_list):{len(rollout_metadata_list)} != len(samples:{len(samples)}") - return False - - for sample, meta in zip(samples, rollout_metadata_list, strict=False): + for sample in samples: queue_sample = QueueSample( id=str(uuid.uuid4()), data=sample, param_version=param_version, timestamp=time.time(), - rollout_metadata=meta or {}, + rollout_metadata=rollout_metadata or {}, ) # 如果队列满了,移除最旧的样本,一般不会发生 @@ -235,7 +227,7 @@ def __init__(self, queue_actor: Any): self.queue_actor = queue_actor def put_samples( - self, samples: list[Any], param_version: int, rollout_metadata_list: list[dict[str, Any]] = None + self, samples: list[Any], param_version: int, rollout_metadata_list: list[dict[str, Any]] = None ) -> bool: """放入batch到队列""" return ray.get(self.queue_actor.put_samples.remote(samples, param_version, rollout_metadata_list)) diff --git a/recipe/fully_async_policy/run_fully_async_example.sh b/recipe/fully_async_policy/run_fully_async_example.sh index 180071318a1..cd2265cde0d 100644 --- a/recipe/fully_async_policy/run_fully_async_example.sh +++ b/recipe/fully_async_policy/run_fully_async_example.sh @@ -55,7 +55,6 @@ max_response_length=1024 # 异步训练参数 staleness_threshold=3 -max_staleness_allowed=5 max_queue_size=1000 min_batch_count=1 batch_timeout=30.0 @@ -121,7 +120,6 @@ python -m recipe.one_step_off_policy.fully_async_main \ \ # 异步训练配置 async_training.staleness_threshold=$staleness_threshold \ - async_training.max_staleness_allowed=$max_staleness_allowed \ async_training.max_queue_size=$max_queue_size \ async_training.min_batch_count=$min_batch_count \ async_training.batch_timeout=$batch_timeout \ From 352066c12fe9362f3bc974e60a474dd913b6d19c Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Tue, 5 Aug 2025 17:48:54 +0800 Subject: [PATCH 023/182] gen data to queue --- tests/special_e2e/run_fully_async_policy.sh | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git 
a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh index 9692aab0d44..2949316228a 100644 --- a/tests/special_e2e/run_fully_async_policy.sh +++ b/tests/special_e2e/run_fully_async_policy.sh @@ -33,7 +33,8 @@ overlong_penalty_factor=1.0 # Training parameters loss_agg_mode="token-mean" -train_prompt_bsz=8 +train_prompt_bsz=32 +gen_prompt_bsz=4 n_resp_per_prompt=3 train_prompt_mini_bsz=4 @@ -50,8 +51,6 @@ n_gpus_training=$((NUM_GPUS - n_gpus_rollout)) # Async training specific configurations staleness_threshold=3 -max_staleness_allowed=5 -max_queue_size=1000 min_batch_count=1 batch_timeout=30.0 generation_timeout=30.0 @@ -74,6 +73,7 @@ common_params=( data.max_prompt_length=${max_prompt_length} data.max_response_length=${max_response_length} data.train_batch_size=${train_prompt_bsz} + data.gen_batch_size=${gen_prompt_bsz} actor_rollout_ref.rollout.n=${n_resp_per_prompt} algorithm.adv_estimator=${adv_estimator} algorithm.use_kl_in_reward=${use_kl_in_reward} @@ -115,7 +115,7 @@ common_params=( trainer.test_freq=-1 trainer.save_freq=-1 trainer.total_epochs=2 - trainer.total_training_steps=4 + trainer.total_training_steps=10 trainer.resume_mode=disable trainer.nnodes=1 trainer.n_gpus_per_node=${n_gpus_training} @@ -123,13 +123,6 @@ common_params=( rollout.n_gpus_per_node=${n_gpus_rollout} # Fully async specific configurations async_training.staleness_threshold=${staleness_threshold} - async_training.max_staleness_allowed=${max_staleness_allowed} - async_training.max_queue_size=${max_queue_size} - async_training.min_batch_count=${min_batch_count} - async_training.batch_timeout=${batch_timeout} - async_training.generation_timeout=${generation_timeout} - async_training.batch_generation_interval=${batch_generation_interval} - async_training.max_sync_retries=${max_sync_retries} async_training.sync_timeout=${sync_timeout} async_training.sync_retry_delay=${sync_retry_delay} ) From 5fac1d8441cebc77ef5f353739e8abde69374edf Mon Sep 17 
00:00:00 2001 From: ArronHZG Date: Tue, 5 Aug 2025 20:16:21 +0800 Subject: [PATCH 024/182] train get data --- .../README_async_trainer.md | 92 +++++ recipe/fully_async_policy/TEST_GUIDE.md | 312 +++++++++++++++++ .../fully_async_rollouter.py | 40 +-- .../fully_async_policy/fully_async_trainer.py | 210 ++++++++++-- recipe/fully_async_policy/message_queue.py | 14 +- recipe/fully_async_policy/run_benchmark.sh | 0 .../test_components_pytest.py | 315 ++++++++++++++++++ 7 files changed, 937 insertions(+), 46 deletions(-) create mode 100644 recipe/fully_async_policy/README_async_trainer.md create mode 100644 recipe/fully_async_policy/TEST_GUIDE.md mode change 100644 => 100755 recipe/fully_async_policy/run_benchmark.sh create mode 100644 recipe/fully_async_policy/test_components_pytest.py diff --git a/recipe/fully_async_policy/README_async_trainer.md b/recipe/fully_async_policy/README_async_trainer.md new file mode 100644 index 00000000000..9fbaa336be6 --- /dev/null +++ b/recipe/fully_async_policy/README_async_trainer.md @@ -0,0 +1,92 @@ +# FullyAsyncTrainer 队列数据获取实现 + +## 概述 + +本实现为 `FullyAsyncTrainer` 类添加了从消息队列获取样本并组成 `gen_batch_output` 的功能,实现了完全异步的训练流程。 + +## 核心功能 + +### 1. 样本计算逻辑 + +```python +# 计算需要获取的样本数量 +n_responses_per_prompt = self.config.actor_rollout_ref.rollout.n +batch_size = self.config.data.train_batch_size +required_samples = n_responses_per_prompt * batch_size +``` + +训练器会根据配置自动计算需要从队列获取的样本数量: +- `rollout.n`: 每个prompt生成的响应数量 +- `train_batch_size`: 训练批次大小 +- 总样本数 = n × batch_size + +### 2. 主要方法 + +#### `_get_samples_from_queue()` +- 从消息队列获取指定数量的样本 +- 组装成 `gen_batch_output` 格式 +- 提取原始batch信息构造 `batch_dict` + +#### `_assemble_gen_batch_output_from_queue_samples()` +- 将队列中的多个样本重新组装成 `DataProto` 对象 +- 处理tensor和non-tensor数据 +- 合并timing信息和metadata + +#### `_extract_batch_dict_from_sample()` +- 从样本数据中提取原始输入信息 +- 过滤掉生成的输出,保留prompt相关数据 + +#### `_async_get_next_batch_from_queue()` +- 异步获取下一批队列数据 +- 使用线程池实现非阻塞操作 + +### 3. 数据流程 + +1. 
**样本生成**: Rollouter生成样本并放入MessageQueue +2. **样本获取**: Trainer从队列异步获取 `n × batch_size` 个样本 +3. **数据重组**: 将队列样本重新组装成标准的 `gen_batch_output` 格式 +4. **训练处理**: 样本进入标准的PPO训练流程 + +### 4. 使用示例 + +```python +# 初始化trainer +trainer = FullyAsyncTrainer(config, tokenizer, role_worker_mapping, resource_pool_manager) + +# 设置消息队列客户端 +trainer.set_message_queue_client(message_queue_client) + +# 开始训练(自动从队列获取数据) +trainer.fit() +``` + +## 配置要求 + +确保配置中包含以下参数: + +```yaml +data: + train_batch_size: 128 # 训练批次大小 + +actor_rollout_ref: + rollout: + n: 4 # 每个prompt的响应数量 +``` + +## 特性 + +- **异步处理**: 使用异步方式从队列获取数据,不阻塞训练流程 +- **数据完整性**: 保持原有的tensor和non-tensor数据结构 +- **元数据保留**: 保留timing、参数版本等重要信息 +- **兼容性**: 与现有的PPO训练流程完全兼容 + +## 监控指标 + +训练器提供以下统计指标: +- `queue_sample_count`: 当前批次的样本数量 +- `rollout_param_versions`: 样本对应的参数版本 +- `sample_timestamps`: 样本生成时间戳 +- timing信息的平均值 + +通过 `trainer.get_statistics()` 可以获取详细的训练统计信息。 + diff --git a/recipe/fully_async_policy/TEST_GUIDE.md b/recipe/fully_async_policy/TEST_GUIDE.md new file mode 100644 index 00000000000..558920e5e84 --- /dev/null +++ b/recipe/fully_async_policy/TEST_GUIDE.md @@ -0,0 +1,312 @@ +# Fully Async Policy 测试指南 + +本文档介绍如何测试完全异步PPO训练系统的各种功能和性能。 + +## 📋 测试概览 + +我们提供了多种类型的测试,涵盖从单元测试到端到端测试的完整测试套件: + +### 测试类型 +1. **单元测试** - 测试各个组件的独立功能 +2. **集成测试** - 测试组件间的协作 +3. **端到端测试** - 测试完整的训练流程 +4. **性能基准测试** - 评估系统性能特征 +5. **压力测试** - 测试系统在极限条件下的表现 + +## 🚀 快速开始 + +### 1. 端到端测试 +最简单的方式是运行端到端测试,验证系统基本功能: + +```bash +# 基本E2E测试 +./run_e2e_test.sh + +# 使用环境变量自定义配置 +NUM_GPUS=4 MODEL_ID=Qwen/Qwen2.5-0.5B-Instruct ./run_e2e_test.sh +``` + +### 2. 单元测试 +运行组件级别的单元测试: + +```bash +# 运行所有单元测试 +cd unittest/ +python test_fully_async_components.py + +# 或者使用pytest(如果安装) +pytest test_components_pytest.py -v +``` + +### 3. 
性能基准测试 +评估系统性能特征: + +```bash +# 运行完整的性能基准测试 +./run_benchmark.sh + +# 自定义GPU数量和策略 +NUM_GPUS=8 ACTOR_STRATEGY=fsdp2 ./run_benchmark.sh +``` + +## 📊 测试脚本详解 + +### run_e2e_test.sh +- **目的**: 端到端功能验证 +- **配置**: 最小化配置,快速验证基本功能 +- **时长**: 约5-10分钟 +- **用法**: `./run_e2e_test.sh` + +**环境变量**: +- `NUM_GPUS`: GPU数量 (默认: 4) +- `MODEL_ID`: 使用的模型ID (默认: Qwen/Qwen2.5-0.5B-Instruct) +- `MODEL_PATH`: 模型存储路径 + +### run_benchmark.sh +- **目的**: 性能基准测试 +- **配置**: 多种配置组合,评估性能影响 +- **时长**: 约30-60分钟 +- **用法**: `./run_benchmark.sh` + +**测试覆盖**: +1. 不同新鲜度阈值的影响 +2. 不同队列大小的性能表现 +3. 生成间隔对吞吐量的影响 +4. GPU资源分配的优化 +5. 暂停/恢复功能测试 + +### test_fully_async_components.py +- **目的**: 单元和集成测试 +- **配置**: 使用Mock对象的孤立测试 +- **时长**: 约2-5分钟 +- **用法**: `python unittest/test_fully_async_components.py` + +**测试覆盖**: +- MessageQueue的基本功能 +- 参数同步器的重试机制 +- Rollouter的暂停/恢复 +- 新鲜度指标计算 +- 错误处理和超时机制 + +## 🔧 测试配置 + +### 最小化测试配置 +用于快速验证功能: + +```yaml +# 基本配置 +data: + train_batch_size: 4 + max_prompt_length: 512 + max_response_length: 1024 + +trainer: + total_training_steps: 2 + n_gpus_per_node: 2 + +rollout: + n_gpus_per_node: 2 + +async_training: + staleness_threshold: 3 + max_queue_size: 100 +``` + +### 性能测试配置 +用于评估系统性能: + +```yaml +# 性能配置 +data: + train_batch_size: 16 + max_prompt_length: 512 + max_response_length: 1024 + +trainer: + total_training_steps: 10 + n_gpus_per_node: 6 + +rollout: + n_gpus_per_node: 2 + +async_training: + staleness_threshold: 3 + max_queue_size: 1000 + generation_timeout: 30.0 +``` + +## 📈 测试结果分析 + +### 成功指标 +测试成功应满足以下条件: + +1. **功能正确性**: + - 样本成功生成和消费 + - 参数同步正常工作 + - 暂停/恢复功能响应 + +2. **性能表现**: + - 样本生成速率 > 目标吞吐量 + - 队列利用率在合理范围(50-80%) + - 新鲜度指标符合预期 + +3. **稳定性**: + - 无内存泄漏 + - 无死锁或竞争条件 + - 优雅处理错误情况 + +### 失败排查 +常见问题及解决方案: + +1. **Ray连接失败**: + ```bash + # 重新初始化Ray + ray stop + ray start --head + ``` + +2. **GPU内存不足**: + ```bash + # 减少批大小或使用梯度检查点 + data.train_batch_size=2 + actor_rollout_ref.model.enable_gradient_checkpointing=True + ``` + +3. 
**队列阻塞**: + ```bash + # 调整队列大小和新鲜度阈值 + async_training.max_queue_size=500 + async_training.staleness_threshold=5 + ``` + +## 🎯 特定功能测试 + +### 测试暂停/恢复功能 +```python +# 在Python脚本中测试 +import ray +from fully_async_rollouter import FullyAsyncRollouter + +rollouter = FullyAsyncRollouter.remote(config, ...) + +# 测试暂停 +result = ray.get(rollouter.pause_rollout.remote()) +assert result == True + +# 测试恢复 +result = ray.get(rollouter.resume_rollout.remote()) +assert result == True +``` + +### 测试新鲜度控制 +```python +# 测试样本过期机制 +queue = MessageQueueClient.remote(max_staleness=3) + +# 放入旧版本样本 +queue.put_samples.remote(sample, param_version=1) + +# 用新版本获取(应该被拒绝) +result = ray.get(queue.get_samples.remote(current_param_version=5)) +assert result is None +``` + +### 测试参数同步 +```python +# 测试同步重试机制 +sync = ParameterSynchronizer.remote(config, actor_wg, rollout_wg) + +# 测试成功同步 +result = ray.get(sync.sync_weights.remote()) +assert result == True +``` + +## 📝 测试报告 + +### 基准测试报告 +运行`./run_benchmark.sh`后,会在`benchmark_results_*/`目录下生成: + +- `performance_report.md` - 详细的性能报告 +- `summary.txt` - 关键指标摘要 +- `*.log` - 各项测试的详细日志 + +### 关键指标 +需要关注的性能指标: + +1. **吞吐量指标**: + - 样本生成速率 (samples/second) + - 训练步数完成速率 (steps/second) + +2. **延迟指标**: + - 样本平均年龄 (average sample age) + - 参数同步延迟 (sync latency) + +3. 
**资源利用率**: + - GPU利用率 (GPU utilization) + - 内存使用量 (memory usage) + - 队列利用率 (queue utilization) + +## 🔄 CI/CD 集成 + +### GitHub Actions 示例 +```yaml +name: Fully Async Policy Tests +on: [push, pull_request] + +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Setup Python + uses: actions/setup-python@v2 + with: + python-version: 3.9 + + - name: Install dependencies + run: | + pip install -r requirements.txt + pip install pytest + + - name: Run unit tests + run: | + cd recipe/fully_async_policy/unittest/ + python test_fully_async_components.py + + - name: Run E2E test (if GPUs available) + run: | + if nvidia-smi; then + cd recipe/fully_async_policy/ + ./run_e2e_test.sh + fi +``` + +## 🛠️ 开发者测试 + +### 添加新测试 +1. **单元测试**: 在`unittest/test_fully_async_components.py`中添加新的测试类 +2. **集成测试**: 在相应的集成测试类中添加新方法 +3. **性能测试**: 在`run_benchmark.sh`中添加新的基准测试场景 + +### 测试最佳实践 +1. **隔离性**: 每个测试应该独立,不依赖其他测试 +2. **可重现性**: 使用固定的随机种子和确定性配置 +3. **清理**: 测试结束后清理资源,避免影响后续测试 +4. **文档**: 为新测试添加清晰的文档说明 + +## ❓ 常见问题 + +**Q: 测试失败,提示Ray连接错误** +A: 确保Ray集群正常运行,或重新启动Ray + +**Q: 内存不足错误** +A: 减少批大小或在测试配置中启用参数卸载 + +**Q: 测试运行时间过长** +A: 使用更小的模型或减少训练步数进行快速测试 + +**Q: 如何添加自定义测试?** +A: 参考现有测试模式,在对应的测试文件中添加新的测试方法 + +通过这套完整的测试系统,可以确保fully async policy系统的可靠性、性能和稳定性。 + diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 9196dc08e94..57d3eed243a 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -18,7 +18,6 @@ import ray from omegaconf import OmegaConf -from tqdm import tqdm from recipe.fully_async_policy.message_queue import MessageQueueClient from verl import DataProto @@ -36,16 +35,16 @@ class FullyAsyncRollouter(RayPPOTrainer): """ def __init__( - self, - config, - tokenizer, - role_worker_mapping: dict[Role, WorkerType], - resource_pool_manager: ResourcePoolManager, - ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, - 
processor=None, - reward_fn=None, - val_reward_fn=None, - device_name=None, + self, + config, + tokenizer, + role_worker_mapping: dict[Role, WorkerType], + resource_pool_manager: ResourcePoolManager, + ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, + processor=None, + reward_fn=None, + val_reward_fn=None, + device_name=None, ): """ Initialize distributed PPO trainer with Ray backend. @@ -144,7 +143,9 @@ def __init__( self.sync_in_progress = False self.sync_lock = threading.Lock() - self.max_queue_size = self.staleness_threshold * self.config.data.train_batch_size * self.config.actor_rollout_ref.rollout.n + self.max_queue_size = ( + self.staleness_threshold * self.config.data.train_batch_size * self.config.actor_rollout_ref.rollout.n + ) def set_message_queue_client(self, message_queue_client: MessageQueueClient): """设置消息队列客户端""" @@ -287,7 +288,6 @@ def _generation_loop(self): if not self.running: break - metrics = {} timing_raw = {} batch, gen_batch = self._prepare_generate_batch(batch_dict) is_last_step = self.global_steps >= self.total_training_steps @@ -328,10 +328,12 @@ def _generation_loop(self): self.dropped_stale_samples += 1 if self.global_steps % 1 == 0: - print(f"Generated {self.total_generated_samples} batches, \n" - f"param_version={self.current_param_version}, \n" - f"errors={self.generation_errors}, \n" - f"Dropped stale samples: {self.dropped_stale_samples}\n") + print( + f"Generated {self.total_generated_samples} batches, \n" + f"param_version={self.current_param_version}, \n" + f"errors={self.generation_errors}, \n" + f"Dropped stale samples: {self.dropped_stale_samples}\n" + ) self.global_steps += 1 @@ -424,8 +426,8 @@ def _should_pause_generation(self) -> bool: # 如果队列太满,也暂停生成 - if queue_size >= max_queue_size: - print(f"Should pause due to full queue: size={queue_size}, max={max_queue_size}") + if queue_size >= self.max_queue_size: + print(f"Should pause due to full queue: size={queue_size}, max={self.max_queue_size}") return True return 
False diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index db6bdfeaebc..5db63c9fab9 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -13,9 +13,11 @@ # limitations under the License. import logging +import threading import time import warnings from pprint import pprint +from typing import Any import numpy as np import ray @@ -33,7 +35,6 @@ WorkerType, ) from verl.utils.debug import marked_timer -from verl.utils.tracking import ValidationGenerationsLogger logger = logging.getLogger(__name__) @@ -115,11 +116,192 @@ def __init__( self.use_critic = False self._validate_config() + + self.lock = threading.RLock() self.message_queue_client = None + self.param_synchronizer = None def set_message_queue_client(self, message_queue_client: MessageQueueClient): """设置消息队列客户端""" - self.message_queue_client = message_queue_client + with self.lock: + self.message_queue_client = message_queue_client + + def set_parameter_synchronizer(self, param_synchronizer): + """设置参数同步器""" + with self.lock: + self.param_synchronizer = param_synchronizer + + def _get_samples_from_queue(self) -> tuple[None, None, None] | tuple[int, dict, Any]: + """ + 从消息队列获取样本并组成gen_batch_output + + Returns: + tuple: (epoch, batch_dict, gen_batch_output) + """ + if self.message_queue_client is None: + raise ValueError("MessageQueue client not set. 
Call set_message_queue_client() first.") + + # 计算需要获取的样本数量 + n_responses_per_prompt = self.config.actor_rollout_ref.rollout.n + batch_size = self.config.data.train_batch_size + required_samples = n_responses_per_prompt * batch_size + + logger.info( + f"Requesting {required_samples} samples from queue (n={n_responses_per_prompt}, batch_size={batch_size})" + ) + + # 从队列获取样本 + queue_samples = self.message_queue_client.get_samples(min_batch_count=required_samples) + + if not queue_samples or len(queue_samples) == 0: + logger.warning("required_samples is empty") + return None, None, None + + logger.info(f"Retrieved {len(queue_samples)} samples from queue") + + # 组装gen_batch_output + gen_batch_output = self._assemble_gen_batch_output_from_queue_samples( + queue_samples, n_responses_per_prompt, batch_size + ) + + # 从第一个样本中提取原始batch信息来构造batch_dict + first_sample = queue_samples[0].data + batch_dict = self._extract_batch_dict_from_sample(first_sample, batch_size) + + return 0, batch_dict, gen_batch_output + + def _assemble_gen_batch_output_from_queue_samples( + self, queue_samples: list[QueueSample], n_responses_per_prompt: int, batch_size: int + ): + """ + 从队列样本中组装gen_batch_output + + Args: + queue_samples: 队列中的样本列表 + n_responses_per_prompt: 每个prompt的响应数量 + batch_size: 批次大小 + + Returns: + DataProto: 组装好的gen_batch_output + """ + import numpy as np + import torch + + from verl.protocol import DataProto + + # 提取所有样本的数据 + sample_data_list = [] + rollout_metadata_list = [] + timing_info = {} + + for sample in queue_samples: + sample_data_list.append(sample.data) + rollout_metadata_list.append(sample.rollout_metadata) + + # 假设所有样本具有相同的数据结构,从第一个样本推断结构 + first_sample_data = sample_data_list[0] + + # 组装tensor数据 + tensor_dict = {} + non_tensor_dict = {} + + # 获取第一个样本的结构来初始化 + if hasattr(first_sample_data, "batch") and first_sample_data.batch is not None: + # 处理tensor数据 + for key in first_sample_data.batch.keys(): + tensor_list = [] + for sample_data in sample_data_list: + if 
hasattr(sample_data, "batch") and sample_data.batch is not None and key in sample_data.batch: + tensor_list.append(sample_data.batch[key]) + else: + logger.warning(f"Missing key '{key}' in sample batch data") + + if tensor_list: + # 连接所有tensor + tensor_dict[key] = torch.cat(tensor_list, dim=0) + + if hasattr(first_sample_data, "non_tensor_batch") and first_sample_data.non_tensor_batch: + # 处理non_tensor数据 + for key in first_sample_data.non_tensor_batch.keys(): + non_tensor_list = [] + for sample_data in sample_data_list: + if ( + hasattr(sample_data, "non_tensor_batch") + and sample_data.non_tensor_batch + and key in sample_data.non_tensor_batch + ): + non_tensor_list.extend(sample_data.non_tensor_batch[key]) + else: + logger.warning(f"Missing key '{key}' in sample non_tensor_batch data") + + if non_tensor_list: + non_tensor_dict[key] = np.array(non_tensor_list, dtype=object) + + # 收集timing信息和metadata + for sample, metadata in zip(queue_samples, rollout_metadata_list, strict=False): + if "timing" in metadata: + for timing_key, timing_value in metadata["timing"].items(): + if timing_key not in timing_info: + timing_info[timing_key] = [] + timing_info[timing_key].append(timing_value) + + # 计算平均timing + avg_timing = {} + for key, values in timing_info.items(): + if values: + avg_timing[key] = sum(values) / len(values) + + # 创建meta_info + meta_info = { + "timing": avg_timing, + "queue_sample_count": len(queue_samples), + "rollout_param_versions": [sample.param_version for sample in queue_samples], + "sample_timestamps": [sample.timestamp for sample in queue_samples], + } + + # 创建DataProto对象 + if tensor_dict or non_tensor_dict: + gen_batch_output = DataProto.from_dict( + tensors=tensor_dict if tensor_dict else None, + non_tensors=non_tensor_dict if non_tensor_dict else None, + meta_info=meta_info, + ) + else: + # 如果没有数据,创建空的DataProto + logger.warning("No tensor or non_tensor data found in samples, creating empty DataProto") + gen_batch_output = 
DataProto.from_dict(meta_info=meta_info) + + logger.info(f"Assembled gen_batch_output with {len(gen_batch_output)} samples") + return gen_batch_output + + def _extract_batch_dict_from_sample(self, sample_data, batch_size: int) -> dict: + """ + 从样本数据中提取batch_dict信息 + + Args: + sample_data: 样本数据 + batch_size: 批次大小 + + Returns: + dict: batch字典 + """ + batch_dict = {} + + # 从样本中提取原始输入信息 + if hasattr(sample_data, "batch") and sample_data.batch is not None: + for key, value in sample_data.batch.items(): + # 只保留输入相关的key,去掉生成的输出 + if key in ["input_ids", "attention_mask", "position_ids"]: + # 由于我们有多个响应,需要取出原始prompt部分 + batch_dict[key] = value[:batch_size] if len(value) >= batch_size else value + + if hasattr(sample_data, "non_tensor_batch") and sample_data.non_tensor_batch: + for key, value in sample_data.non_tensor_batch.items(): + # 保留非tensor的批次数据 + if key in ["raw_prompt_ids", "raw_prompt", "multi_modal_data", "tools_kwargs", "interaction_kwargs"]: + batch_dict[key] = np.array(value[:batch_size]) if len(value) >= batch_size else np.array(value) + + return batch_dict def _create_actor_rollout_classes(self): # create actor @@ -156,7 +338,6 @@ def fit(self): to construct the PPO dataflow. The light-weight advantage computation is done on the driver process. """ - logger.info("Starting Trainer...") if self.message_queue_client is None: raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.") @@ -193,13 +374,13 @@ def fit(self): last_val_metrics = None self.max_steps_duration = 0 - # across epoch iterator - continuous_iterator = self._create_continuous_iterator() - - # Start the first asynchronous generation task. 
- batch_data_future = self._async_gen_next_batch(continuous_iterator) + # 使用队列模式,不需要传统的dataloader迭代器 + # 初始化获取第一批数据 + while True: + epoch, batch, gen_batch_output = self._get_samples_from_queue() + if gen_batch_output is None: + break - while batch_data_future is not None: metrics = {} timing_raw = {} @@ -213,17 +394,6 @@ def fit(self): is_last_step = self.global_steps >= self.total_training_steps with marked_timer("step", timing_raw): - # wait for the previous batch - with marked_timer("wait_prev_gen", timing_raw, color="red"): - epoch, batch, gen_batch_output = batch_data_future.get() - timing_raw.update(gen_batch_output.meta_info["timing"]) - gen_batch_output.meta_info.pop("timing", None) - - # asys next generation (with syns weights from actor to rollout) - with marked_timer("sync_rollout_weights", timing_raw, color="purple"): - if not is_last_step: - batch_data_future = self._async_gen_next_batch(continuous_iterator) - batch = self._post_generate_batch(batch, gen_batch_output, metrics) batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw) self._log_rollout(batch, reward_extra_infos_dict, timing_raw) diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py index 61723cde953..b72b9482e09 100644 --- a/recipe/fully_async_policy/message_queue.py +++ b/recipe/fully_async_policy/message_queue.py @@ -76,7 +76,7 @@ def __init__(self, config: DictConfig, max_queue_size: int = 1000): ) def put_samples( - self, samples: list[Any] | Any, param_version: int, rollout_metadata: dict[str, Any] = None + self, samples: list[Any] | Any, param_version: int, rollout_metadata: dict[str, Any] = None ) -> bool: """ 放入一个batch样本到队列 @@ -123,26 +123,26 @@ def put_samples( return True - def get_samples(self, min_batch: int = 1) -> list[QueueSample]: + def get_samples(self, min_batch_count: int = 1) -> list[QueueSample]: """ 从队列获取batch样本,一直等待直到有足够样本 Args: - min_batch: sample数量满足min_batch,一次性获取 + min_batch_count: 
sample数量满足min_batch,一次性获取 Returns: List[QueueSample]: 获取的样本列表 """ with self.lock: - while len(self.queue) < min_batch and self.running: + while len(self.queue) < min_batch_count and self.running: self.consumer_condition.wait() # 如果队列已关闭且没有足够样本,返回空列表 - if not self.running and len(self.queue) < min_batch: + if not self.running and len(self.queue) < min_batch_count: return [] # 获取指定数量的样本 - batch_count = min(min_batch, len(self.queue)) + batch_count = min(min_batch_count, len(self.queue)) samples = [] for _ in range(batch_count): if self.queue: @@ -227,7 +227,7 @@ def __init__(self, queue_actor: Any): self.queue_actor = queue_actor def put_samples( - self, samples: list[Any], param_version: int, rollout_metadata_list: list[dict[str, Any]] = None + self, samples: list[Any], param_version: int, rollout_metadata_list: list[dict[str, Any]] = None ) -> bool: """放入batch到队列""" return ray.get(self.queue_actor.put_samples.remote(samples, param_version, rollout_metadata_list)) diff --git a/recipe/fully_async_policy/run_benchmark.sh b/recipe/fully_async_policy/run_benchmark.sh old mode 100644 new mode 100755 diff --git a/recipe/fully_async_policy/test_components_pytest.py b/recipe/fully_async_policy/test_components_pytest.py new file mode 100644 index 00000000000..d887e17fc12 --- /dev/null +++ b/recipe/fully_async_policy/test_components_pytest.py @@ -0,0 +1,315 @@ +# Copyright 2025 Meituan Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +Pytest测试文件,用于测试完全异步PPO训练系统的各个组件 +""" + +import time +from unittest.mock import Mock + +import pytest +import ray +from omegaconf import OmegaConf + + +@pytest.fixture +def ray_setup(): + """Ray初始化fixture""" + if not ray.is_initialized(): + ray.init(ignore_reinit_error=True, num_cpus=2) + yield + # 测试后不关闭Ray,因为其他测试可能还需要 + + +@pytest.fixture +def basic_config(): + """基本配置fixture""" + return OmegaConf.create( + { + "actor_rollout_ref": {"hybrid_engine": False, "model": {"lora_rank": 0}, "rollout": {"n": 2}}, + "algorithm": {"use_kl_in_reward": False}, + "critic": {"enable": False}, + "trainer": { + "device": "cpu", + "project_name": "test", + "experiment_name": "test", + "total_epochs": 1, + "total_training_steps": 2, + }, + "async_training": { + "staleness_threshold": 3, + "max_staleness_allowed": 5, + "generation_timeout": 10.0, + "batch_timeout": 5.0, + }, + "data": {"train_batch_size": 4}, + } + ) + + +class TestMessageQueue: + """测试MessageQueue功能""" + + def test_message_queue_creation(self, ray_setup): + """测试MessageQueue创建""" + try: + from message_queue import MessageQueueClient + + queue = MessageQueueClient.remote(max_queue_size=10, max_staleness=3) + + # 测试基本功能 + stats = ray.get(queue.get_statistics.remote()) + assert "queue_size" in stats + assert stats["queue_size"] == 0 + + ray.kill(queue) + + except ImportError: + pytest.skip("MessageQueue not available") + + def test_queue_put_get(self, ray_setup): + """测试队列的put/get操作""" + try: + from message_queue import MessageQueueClient + + queue = MessageQueueClient.remote(max_queue_size=10, max_staleness=3) + + # 创建模拟样本 + mock_sample = Mock() + mock_sample.batch_size = 4 + + # 测试放入样本 + success = ray.get( + queue.put_samples.remote( + epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()} + ) + ) + assert success + + # 测试获取样本 + result = ray.get(queue.get_samples.remote(min_batch_count=1, timeout=2.0, current_param_version=1)) + assert result is not None + + ray.kill(queue) + 
+ except ImportError: + pytest.skip("MessageQueue not available") + + +class TestRollouter: + """测试Rollouter功能""" + + def test_rollouter_pause_resume(self, ray_setup, basic_config): + """测试Rollouter的暂停恢复功能""" + try: + from fully_async_rollouter import FullyAsyncRollouter + + # 创建模拟依赖 + mock_tokenizer = Mock() + mock_role_worker_mapping = {} + mock_resource_pool_manager = Mock() + + # 创建Rollouter + rollouter = FullyAsyncRollouter.remote( + config=basic_config, + tokenizer=mock_tokenizer, + role_worker_mapping=mock_role_worker_mapping, + resource_pool_manager=mock_resource_pool_manager, + ) + + # 测试暂停 + result = ray.get(rollouter.pause_rollout.remote()) + assert result is True + + # 检查状态 + is_paused = ray.get(rollouter.is_rollout_paused.remote()) + assert is_paused is True + + # 测试恢复 + result = ray.get(rollouter.resume_rollout.remote()) + assert result is True + + # 检查状态 + is_paused = ray.get(rollouter.is_rollout_paused.remote()) + assert is_paused is False + + ray.kill(rollouter) + + except ImportError: + pytest.skip("FullyAsyncRollouter not available") + + def test_rollouter_statistics(self, ray_setup, basic_config): + """测试Rollouter统计功能""" + try: + from fully_async_rollouter import FullyAsyncRollouter + + mock_tokenizer = Mock() + mock_role_worker_mapping = {} + mock_resource_pool_manager = Mock() + + rollouter = FullyAsyncRollouter.remote( + config=basic_config, + tokenizer=mock_tokenizer, + role_worker_mapping=mock_role_worker_mapping, + resource_pool_manager=mock_resource_pool_manager, + ) + + # 获取统计信息 + stats = ray.get(rollouter.get_statistics.remote()) + + # 验证必要字段存在 + required_fields = [ + "total_generated_samples", + "dropped_stale_samples", + "generation_errors", + "current_param_version", + "is_paused", + "pause_count", + ] + + for field in required_fields: + assert field in stats + + ray.kill(rollouter) + + except ImportError: + pytest.skip("FullyAsyncRollouter not available") + + +class TestTrainer: + """测试Trainer功能""" + + def 
test_trainer_creation(self, ray_setup, basic_config): + """测试Trainer创建""" + try: + from fully_async_trainer import FullyAsyncTrainer + + mock_tokenizer = Mock() + mock_role_worker_mapping = {} + mock_resource_pool_manager = Mock() + + trainer = FullyAsyncTrainer.remote( + config=basic_config, + tokenizer=mock_tokenizer, + role_worker_mapping=mock_role_worker_mapping, + resource_pool_manager=mock_resource_pool_manager, + ) + + # 基本验证 + assert trainer is not None + + ray.kill(trainer) + + except ImportError: + pytest.skip("FullyAsyncTrainer not available") + + +class TestParameterSync: + """测试参数同步功能""" + + def test_param_sync_creation(self, ray_setup): + """测试参数同步器创建""" + try: + from param_sync import ParameterSynchronizer + + config = OmegaConf.create( + {"async_training": {"max_sync_retries": 3, "sync_timeout": 10.0, "sync_retry_delay": 0.1}} + ) + + mock_actor_wg = Mock() + mock_rollout_wg = Mock() + + synchronizer = ParameterSynchronizer.remote( + config=config, actor_wg=mock_actor_wg, rollout_wg=mock_rollout_wg + ) + + assert synchronizer is not None + + ray.kill(synchronizer) + + except ImportError: + pytest.skip("ParameterSynchronizer not available") + + +class TestIntegration: + """集成测试""" + + def test_basic_workflow_simulation(self, ray_setup): + """测试基本工作流模拟""" + # 这是一个简化的集成测试,模拟基本的工作流 + try: + from message_queue import MessageQueueClient + + # 创建消息队列 + queue = MessageQueueClient.remote(max_queue_size=5, max_staleness=2) + + # 模拟生产者(Rollouter) + mock_sample = Mock() + mock_sample.batch_size = 2 + + # 放入样本 + success = ray.get( + queue.put_samples.remote( + epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()} + ) + ) + assert success + + # 模拟消费者(Trainer) + result = ray.get(queue.get_samples.remote(min_batch_count=1, timeout=2.0, current_param_version=1)) + assert result is not None + + samples, metadata_list = result + assert len(samples) == 1 + assert len(metadata_list) == 1 + + ray.kill(queue) + + except ImportError: + 
pytest.skip("Integration test components not available") + + +class TestErrorHandling: + """错误处理测试""" + + def test_timeout_handling(self, ray_setup): + """测试超时处理""" + try: + from message_queue import MessageQueueClient + + queue = MessageQueueClient.remote(max_queue_size=5, max_staleness=2) + + # 测试从空队列超时获取 + start_time = time.time() + result = ray.get( + queue.get_samples.remote( + min_batch_count=1, + timeout=1.0, # 1秒超时 + current_param_version=1, + ) + ) + elapsed = time.time() - start_time + + assert result is None + assert 0.9 <= elapsed <= 2.0 # 允许一些误差 + + ray.kill(queue) + + except ImportError: + pytest.skip("MessageQueue not available") + + +if __name__ == "__main__": + # 如果直接运行此文件,执行所有测试 + pytest.main([__file__, "-v"]) From 459aa7157c2abc71e53a69d357b0f52e5d0c8ccd Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Wed, 6 Aug 2025 10:46:54 +0800 Subject: [PATCH 025/182] put data to queue --- .../fully_async_policy/README_fully_async.md | 28 +++ recipe/fully_async_policy/TEST_GUIDE.md | 3 +- recipe/fully_async_policy/fully_async_main.py | 11 +- .../fully_async_rollouter.py | 159 ++++++------------ recipe/fully_async_policy/message_queue.py | 40 ++--- .../test_components_pytest.py | 4 +- .../unittest/test_fully_async.py | 8 +- .../unittest/test_fully_async_components.py | 12 +- recipe/fully_async_policy/unittest/test_mq.py | 26 +-- 9 files changed, 127 insertions(+), 164 deletions(-) diff --git a/recipe/fully_async_policy/README_fully_async.md b/recipe/fully_async_policy/README_fully_async.md index 1708be5ae34..916633a4a81 100644 --- a/recipe/fully_async_policy/README_fully_async.md +++ b/recipe/fully_async_policy/README_fully_async.md @@ -306,3 +306,31 @@ def custom_monitor(trainer_stats, rollouter_stats): - 简单的消息队列实现 - 基本的参数同步功能 + +```python +DataProtoItem( + batch=TensorDict( + fields={ + attention_mask: Tensor(shape=torch.Size([3072]), device=cpu, dtype=torch.int64, is_shared=False), + input_ids: Tensor(shape=torch.Size([3072]), device=cpu, dtype=torch.int64, 
is_shared=False), + position_ids: Tensor(shape=torch.Size([3072]), device=cpu, dtype=torch.int64, is_shared=False), + prompts: Tensor(shape=torch.Size([1024]), device=cpu, dtype=torch.int64, is_shared=False), + response_mask: Tensor(shape=torch.Size([2048]), device=cpu, dtype=torch.int64, is_shared=False), + responses: Tensor(shape=torch.Size([2048]), device=cpu, dtype=torch.int64, is_shared=False)}, + batch_size=torch.Size([]), + device=None, + is_shared=False), + non_tensor_batch={'data_source': 'openai/gsm8k', + 'ability': 'math', + 'reward_model': {'ground_truth': '35', 'style': 'rule'}, + 'extra_info': { + 'answer': 'The total number of green and red plates is 28 + 21 = <<28+21=49>>49.\nXavier should buy 84 − 49 = 35 more plates.\n#### 35', + 'index': 1421, + 'question': 'Xavier needs 84 paper plates for a housewarming party. He already has 21 green plates and 28 red plates. How many more plates should Xavier buy?', 'split': 'train'}, + 'uid': 'fab3e910-67b3-4653-bc69-377250049267', + 'tools_kwargs': {}, + 'interaction_kwargs': {}, + 'index': 1421}, + meta_info={'global_token_num': [2141, 2141, 2161, 2151, 2151, 2130, 2141, 2161, 2161, 2151, 2130, 2130]}) +``` + diff --git a/recipe/fully_async_policy/TEST_GUIDE.md b/recipe/fully_async_policy/TEST_GUIDE.md index 558920e5e84..3933998cd84 100644 --- a/recipe/fully_async_policy/TEST_GUIDE.md +++ b/recipe/fully_async_policy/TEST_GUIDE.md @@ -199,12 +199,13 @@ assert result == True ``` ### 测试新鲜度控制 + ```python # 测试样本过期机制 queue = MessageQueueClient.remote(max_staleness=3) # 放入旧版本样本 -queue.put_samples.remote(sample, param_version=1) +queue.put_sample.remote(sample, param_version=1) # 用新版本获取(应该被拒绝) result = ray.get(queue.get_samples.remote(current_param_version=5)) diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py index d7079d4af2b..c17b2d8dbdd 100644 --- a/recipe/fully_async_policy/fully_async_main.py +++ b/recipe/fully_async_policy/fully_async_main.py @@ -223,9 
+223,13 @@ def _initialize_components(self, config) -> None: self.components["val_reward_fn"] = val_reward_fn # 创建MessageQueue + self.max_queue_size = ( + config.async_training.staleness_threshold + * config.data.train_batch_size + * config.actor_rollout_ref.rollout.n + ) print("Creating MessageQueue...") - max_queue_size = config.async_training.staleness_threshold * config.data.train_batch_size - message_queue = MessageQueue.remote(config, max_queue_size) + message_queue = MessageQueue.remote(config, self.max_queue_size) message_queue_client = MessageQueueClient(message_queue) self.components["message_queue"] = message_queue @@ -260,6 +264,7 @@ def _create_rollouter(self, config) -> None: ray_worker_group_cls=self.components["ray_worker_group_cls"], processor=self.components["processor"], device_name=config.trainer.device, + max_queue_size=self.max_queue_size, ) print(rollouter) @@ -311,6 +316,8 @@ def _run_training_loop(self): ray.get(rollouter_future) # ray.get(trainer_future) + self.components['message_queue_client'].clear_queue.remote() + print("Training completed or interrupted") def _monitor_components(self): diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 57d3eed243a..61b21b43fd5 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -19,8 +19,7 @@ import ray from omegaconf import OmegaConf -from recipe.fully_async_policy.message_queue import MessageQueueClient -from verl import DataProto +from recipe.fully_async_policy.message_queue import MessageQueueClient, QueueSample from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup from verl.trainer.ppo.ray_trainer import RayPPOTrainer, ResourcePoolManager, Role, WorkerType from verl.utils.debug import marked_timer @@ -45,6 +44,7 @@ def __init__( reward_fn=None, val_reward_fn=None, device_name=None, + max_queue_size=1000, ): """ Initialize distributed 
PPO trainer with Ray backend. @@ -59,10 +59,6 @@ def __init__( processor: Optional data processor, used for multimodal data reward_fn: Function for computing rewards during training. val_reward_fn: Function for computing rewards during validation. - train_dataset (Optional[Dataset], optional): Training dataset. Defaults to None. - val_dataset (Optional[Dataset], optional): Validation dataset. Defaults to None. - collate_fn: Function to collate data samples into batches. - train_sampler (Optional[Sampler], optional): Sampler for the training dataset. Defaults to None. device_name (str, optional): Device name for training (e.g., "cuda", "cpu"). Defaults to None. """ # Store the tokenizer for text processing @@ -115,7 +111,6 @@ def __init__( # 统计信息 self.total_generated_samples = 0 self.dropped_stale_samples = 0 - self.generation_errors = 0 self.param_sync_requests = 0 # Worker groups @@ -143,9 +138,7 @@ def __init__( self.sync_in_progress = False self.sync_lock = threading.Lock() - self.max_queue_size = ( - self.staleness_threshold * self.config.data.train_batch_size * self.config.actor_rollout_ref.rollout.n - ) + self.max_queue_size = max_queue_size def set_message_queue_client(self, message_queue_client: MessageQueueClient): """设置消息队列客户端""" @@ -257,19 +250,6 @@ def _generation_loop(self): last_val_metrics = None self.max_steps_duration = 0 - """ - 主要的生成循环 - - 循环入口,需要 - 1. running 判断 - 4. 中断判断 - 3. 新鲜度判断 - - 生成样本过程中,需要 - 1. running 判断 - 2. 
中断判断 - """ - continuous_iterator = self._create_continuous_iterator() for epoch, batch_dict in continuous_iterator: with self.lock: @@ -288,6 +268,7 @@ def _generation_loop(self): if not self.running: break + metrics = {} timing_raw = {} batch, gen_batch = self._prepare_generate_batch(batch_dict) is_last_step = self.global_steps >= self.total_training_steps @@ -308,101 +289,65 @@ def _generation_loop(self): "generation_timestamp": time.time(), "rollout_param_version": self.current_param_version, } + batch = self._post_generate_batch(batch, gen_batch_output, metrics) - gen_batch_output: DataProto = gen_batch_output - print(gen_batch_output) - for i in gen_batch_output: - print(i) - - # 放入队列 - success = self.message_queue_client.put_samples( - samples=gen_batch_output, - param_version=self.current_param_version, - rollout_metadata_list=rollout_metadata, - ) - print(f"put samples {success}") - with self.lock: - if success: - self.total_generated_samples += 1 - else: - self.dropped_stale_samples += 1 - - if self.global_steps % 1 == 0: - print( - f"Generated {self.total_generated_samples} batches, \n" - f"param_version={self.current_param_version}, \n" - f"errors={self.generation_errors}, \n" - f"Dropped stale samples: {self.dropped_stale_samples}\n" + for sample in batch: + # for sample in samples: + queue_sample = QueueSample( + data=sample, + rollout_metadata=rollout_metadata, + ) + # 放入队列 + success = self.message_queue_client.put_sample( + sample=ray.cloudpickle.dumps(queue_sample), + param_version=self.current_param_version, ) + print(f"put samples {success}") + with self.lock: + if success: + self.total_generated_samples += 1 + else: + self.dropped_stale_samples += 1 + + if self.global_steps % 1 == 0: + print( + f"Generated {self.total_generated_samples} batches, \n" + f"param_version={self.current_param_version}, \n" + f"Dropped stale samples: {self.dropped_stale_samples}\n" + ) self.global_steps += 1 if is_last_step: pprint(f"Final validation metrics: 
{last_val_metrics}") - return + break + + with self.lock: + self.running = False def _monitor_loop(self): """监控线程 - 监控状态并处理控制信号""" - try: - # 主线程保持运行,处理控制信号和状态监控 - last_stats_time = time.time() - stats_interval = 30.0 # 30秒报告一次统计 - check_interval = 5.0 # 5秒检查一次状态 - - while True: + # 主线程保持运行,处理控制信号和状态监控 + last_stats_time = time.time() + stats_interval = 30.0 # 30秒报告一次统计 + check_interval = 5.0 # 5秒检查一次状态 + while True: + with self.lock: + if not self.running: + break + time.sleep(check_interval) + # 定期打印统计信息 + current_time = time.time() + if current_time - last_stats_time >= stats_interval: + print(self.get_statistics()) + last_stats_time = current_time + # 检查是否应该恢复生成 + if not self._should_pause_generation(): with self.lock: - if not self.running: - break - - time.sleep(check_interval) - - # 定期打印统计信息 - current_time = time.time() - if current_time - last_stats_time >= stats_interval: - self._log_statistics() - last_stats_time = current_time - - # 检查是否应该恢复生成 - if not self._should_pause_generation(): - with self.lock: - if self.paused: - self.paused = False - self.condition.notify_all() - print("Generation resumed") - - except Exception as e: - print(f"Error in monitor loop: {e}") - finally: - print("Monitor thread exiting") - - def _report_loop(self): - try: - # 主线程保持运行,处理控制信号和状态监控 - last_stats_time = time.time() - stats_interval = 10.0 - - while self.running: - time.sleep(1.0) - - # 定期打印统计信息 - current_time = time.time() - if current_time - last_stats_time >= stats_interval: - self.get_statistics() - last_stats_time = current_time - if not self._should_pause_generation(): - self.resume() - - # 检查生成线程状态 - if not self.generation_thread.is_alive(): - print("Generation thread died, restarting...") - raise RuntimeError("generation_thread not alive") - - except KeyboardInterrupt: - print("Received interrupt signal, shutting down...") - except Exception as e: - print(f"Error in main loop: {e}") - finally: - self.shutdown() + if self.paused: + self.paused = False + 
self.condition.notify_all() + print("Generation resumed") def _should_pause_generation(self) -> bool: """ diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py index b72b9482e09..f4dcd1f522d 100644 --- a/recipe/fully_async_policy/message_queue.py +++ b/recipe/fully_async_policy/message_queue.py @@ -30,10 +30,7 @@ class QueueSample: """单个batch样本,包含参数版本和新鲜度信息""" - id: str data: Any - param_version: int - timestamp: float rollout_metadata: dict[str, Any] @@ -75,16 +72,13 @@ def __init__(self, config: DictConfig, max_queue_size: int = 1000): "staleness_threshold={self.staleness_threshold}" ) - def put_samples( - self, samples: list[Any] | Any, param_version: int, rollout_metadata: dict[str, Any] = None - ) -> bool: + def put_sample(self, sample: Any, param_version: int) -> bool: """ 放入一个batch样本到队列 Args: - samples: 样本数据 + sample: 样本数据 param_version: 参数版本号 - rollout_metadata: rollout相关的元数据 Returns: bool: 是否成功放入队列 @@ -97,23 +91,13 @@ def put_samples( logger.debug(f"Dropped stale sample: staleness={staleness}, threshold={self.staleness_threshold}") return False - for sample in samples: - queue_sample = QueueSample( - id=str(uuid.uuid4()), - data=sample, - param_version=param_version, - timestamp=time.time(), - rollout_metadata=rollout_metadata or {}, - ) - - # 如果队列满了,移除最旧的样本,一般不会发生 - if len(self.queue) >= self.max_queue_size: - removed = self.queue.popleft() - self.dropped_samples += 1 - logger.warning(f"Queue full, dropped sample {removed.id}") - - self.queue.append(queue_sample) - self.total_produced += 1 + # 如果队列满了,移除最旧的样本,一般不会发生 + if len(self.queue) >= self.max_queue_size: + removed = self.queue.popleft() + self.dropped_samples += 1 + logger.warning(f"Queue full, dropped sample {removed.id}") + self.queue.append(sample) + self.total_produced += 1 # 通知等待的消费者 self.consumer_condition.notify() @@ -226,11 +210,9 @@ class MessageQueueClient: def __init__(self, queue_actor: Any): self.queue_actor = queue_actor - def put_samples( 
- self, samples: list[Any], param_version: int, rollout_metadata_list: list[dict[str, Any]] = None - ) -> bool: + def put_sample(self, sample: Any, param_version: int) -> bool: """放入batch到队列""" - return ray.get(self.queue_actor.put_samples.remote(samples, param_version, rollout_metadata_list)) + return ray.get(self.queue_actor.put_sample.remote(sample, param_version)) def get_samples(self, min_batch_count: int = 1) -> list[QueueSample]: """从队列获取batch,一直等待直到有足够样本""" diff --git a/recipe/fully_async_policy/test_components_pytest.py b/recipe/fully_async_policy/test_components_pytest.py index d887e17fc12..fd2e207cbe4 100644 --- a/recipe/fully_async_policy/test_components_pytest.py +++ b/recipe/fully_async_policy/test_components_pytest.py @@ -91,7 +91,7 @@ def test_queue_put_get(self, ray_setup): # 测试放入样本 success = ray.get( - queue.put_samples.remote( + queue.put_sample.remote( epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()} ) ) @@ -260,7 +260,7 @@ def test_basic_workflow_simulation(self, ray_setup): # 放入样本 success = ray.get( - queue.put_samples.remote( + queue.put_sample.remote( epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()} ) ) diff --git a/recipe/fully_async_policy/unittest/test_fully_async.py b/recipe/fully_async_policy/unittest/test_fully_async.py index a6646b17575..126ff489bf2 100644 --- a/recipe/fully_async_policy/unittest/test_fully_async.py +++ b/recipe/fully_async_policy/unittest/test_fully_async.py @@ -61,7 +61,7 @@ def test_basic_put_get(self): mock_batch = Mock(spec=DataProto) # 放入样本 - success = self.client.put_samples(samples=mock_batch, param_version=1, rollout_metadata={"test": "data"}) + success = self.client.put_sample(sample=mock_batch, param_version=1, rollout_metadata={"test": "data"}) self.assertTrue(success) # 获取样本 @@ -78,8 +78,8 @@ def test_freshness_control(self): self.client.update_param_version(10) # 尝试放入过期样本 - success = self.client.put_samples( - 
samples=mock_batch, + success = self.client.put_sample( + sample=mock_batch, param_version=5, # 版本差异为5,超过阈值3 rollout_metadata={}, ) @@ -159,7 +159,7 @@ def test_integration(): # 生产样本 for i in range(5): - success = client.put_samples(samples=mock_batch, param_version=i, rollout_metadata={"batch_id": i}) + success = client.put_sample(sample=mock_batch, param_version=i, rollout_metadata={"batch_id": i}) assert success, f"Failed to put batch {i}" # 消费样本 diff --git a/recipe/fully_async_policy/unittest/test_fully_async_components.py b/recipe/fully_async_policy/unittest/test_fully_async_components.py index 8e5279b84bb..8a5bc85d562 100644 --- a/recipe/fully_async_policy/unittest/test_fully_async_components.py +++ b/recipe/fully_async_policy/unittest/test_fully_async_components.py @@ -58,7 +58,7 @@ def test_put_and_get_samples(self): # 测试放入样本 success = ray.get( - self.message_queue.put_samples.remote( + self.message_queue.put_sample.remote( epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()} ) ) @@ -79,7 +79,7 @@ def test_staleness_control(self): # 放入一个参数版本较老的样本 success = ray.get( - self.message_queue.put_samples.remote( + self.message_queue.put_sample.remote( epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()} ) ) @@ -109,7 +109,7 @@ def test_queue_statistics(self): for i in range(3): ray.get( - self.message_queue.put_samples.remote( + self.message_queue.put_sample.remote( epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()} ) ) @@ -339,7 +339,7 @@ def test_message_queue_trainer_integration(self): mock_sample.batch_size = 4 ray.get( - message_queue.put_samples.remote( + message_queue.put_sample.remote( epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()} ) ) @@ -377,7 +377,7 @@ def test_message_queue_overflow(self): # 填满队列 for i in range(2): result = ray.get( - message_queue.put_samples.remote( + 
message_queue.put_sample.remote( epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()} ) ) @@ -385,7 +385,7 @@ def test_message_queue_overflow(self): # 尝试再放入一个样本(应该失败或者覆盖旧样本) result = ray.get( - message_queue.put_samples.remote( + message_queue.put_sample.remote( epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()} ) ) diff --git a/recipe/fully_async_policy/unittest/test_mq.py b/recipe/fully_async_policy/unittest/test_mq.py index 02e9839bcfd..2fff49d6576 100644 --- a/recipe/fully_async_policy/unittest/test_mq.py +++ b/recipe/fully_async_policy/unittest/test_mq.py @@ -66,7 +66,7 @@ def test_put_samples_success(self, message_queue_client, mock_data_proto): samples = [mock_data_proto, mock_data_proto] metadata_list = [{"test": "data1"}, {"test": "data2"}] - result = message_queue_client.put_samples(samples=samples, param_version=1, rollout_metadata_list=metadata_list) + result = message_queue_client.put_sample(sample=samples, param_version=1, rollout_metadata=metadata_list) assert result is True @@ -83,7 +83,7 @@ def test_put_samples_without_metadata(self, message_queue_client, mock_data_prot """测试不提供metadata时的处理""" samples = [mock_data_proto, mock_data_proto] - result = message_queue_client.put_samples(samples=samples, param_version=1, rollout_metadata_list=None) + result = message_queue_client.put_sample(sample=samples, param_version=1, rollout_metadata=None) assert result is True queue_size = message_queue_client.get_queue_size() @@ -94,7 +94,7 @@ def test_put_samples_metadata_mismatch(self, message_queue_client, mock_data_pro samples = [mock_data_proto, mock_data_proto] metadata_list = [{"test": "data1"}] # 长度不匹配 - result = message_queue_client.put_samples(samples=samples, param_version=1, rollout_metadata_list=metadata_list) + result = message_queue_client.put_sample(sample=samples, param_version=1, rollout_metadata=metadata_list) assert result is False # 应该失败 queue_size = 
message_queue_client.get_queue_size() @@ -107,10 +107,10 @@ def test_put_samples_staleness_check(self, message_queue_client, mock_data_proto # 尝试放入版本过旧的batch(版本差异>=3会被拒绝) samples = [mock_data_proto] - result = message_queue_client.put_samples( - samples=samples, + result = message_queue_client.put_sample( + sample=samples, param_version=2, # 5-2=3, 达到阈值 - rollout_metadata_list=None, + rollout_metadata=None, ) assert result is False @@ -124,7 +124,7 @@ def test_put_samples_queue_overflow(self, message_queue_client, mock_data_proto) # 填满队列(最大容量10) for i in range(6): # 每次放入2个,总共12个,超过最大容量10 samples = [mock_data_proto, mock_data_proto] - message_queue_client.put_samples(samples=samples, param_version=1, rollout_metadata_list=None) + message_queue_client.put_sample(sample=samples, param_version=1, rollout_metadata=None) # 队列大小应该保持在最大值 queue_size = message_queue_client.get_queue_size() @@ -139,7 +139,7 @@ def test_get_samples_success(self, message_queue_client, mock_data_proto): # 先放入一些samples samples = [mock_data_proto, mock_data_proto, mock_data_proto] metadata_list = [{"index": 0}, {"index": 1}, {"index": 2}] - message_queue_client.put_samples(samples=samples, param_version=1, rollout_metadata_list=metadata_list) + message_queue_client.put_sample(sample=samples, param_version=1, rollout_metadata=metadata_list) # 获取2个samples retrieved_samples = message_queue_client.get_samples(min_batch_count=2) @@ -168,7 +168,7 @@ def get_samples(): def put_samples_later(): time.sleep(0.5) # 延迟放入 samples = [mock_data_proto, mock_data_proto] - message_queue_client.put_samples(samples=samples, param_version=1, rollout_metadata_list=None) + message_queue_client.put_sample(sample=samples, param_version=1, rollout_metadata=None) # 启动消费者线程 consumer_thread = threading.Thread(target=get_samples) @@ -194,7 +194,7 @@ def test_clear_queue(self, message_queue_client, mock_data_proto): """测试清空队列""" # 先添加一些样本 samples = [mock_data_proto, mock_data_proto, mock_data_proto] - 
message_queue_client.put_samples(samples=samples, param_version=1, rollout_metadata_list=None) + message_queue_client.put_sample(sample=samples, param_version=1, rollout_metadata=None) # 清空队列 message_queue_client.clear_queue() @@ -208,7 +208,7 @@ def test_get_queue_size(self, message_queue_client, mock_data_proto): assert message_queue_client.get_queue_size() == 0 samples = [mock_data_proto] - message_queue_client.put_samples(samples=samples, param_version=1, rollout_metadata_list=None) + message_queue_client.put_sample(sample=samples, param_version=1, rollout_metadata=None) assert message_queue_client.get_queue_size() == 1 def test_get_statistics(self, message_queue_client): @@ -233,7 +233,7 @@ def test_get_memory_usage(self, message_queue_client, mock_data_proto): """测试获取内存使用统计""" # 添加一些样本 samples = [mock_data_proto, mock_data_proto] - message_queue_client.put_samples(samples=samples, param_version=1, rollout_metadata_list=None) + message_queue_client.put_sample(sample=samples, param_version=1, rollout_metadata=None) memory_stats = message_queue_client.get_memory_usage() @@ -282,7 +282,7 @@ def test_concurrent_put_get(self, mock_data_proto): def producer(): for i in range(50): samples = [mock_data_proto, mock_data_proto] - result = client.put_samples(samples=samples, param_version=1, rollout_metadata_list=None) + result = client.put_sample(sample=samples, param_version=1, rollout_metadata=None) results.append(("put", result)) time.sleep(0.1) From c65b6279b6b16c72109d60bb36ad8724fc42e906 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Wed, 6 Aug 2025 14:23:35 +0800 Subject: [PATCH 026/182] merge data proto item --- recipe/fully_async_policy/fully_async_main.py | 4 +- .../fully_async_policy/fully_async_trainer.py | 265 +++++----- recipe/fully_async_policy/message_queue.py | 2 - .../unittest/protocol_examples.py | 202 ++++++++ .../unittest/test_protocol_split_merge.py | 466 ++++++++++++++++++ verl/protocol.py | 166 ++++++- 6 files changed, 979 insertions(+), 126 
deletions(-) create mode 100644 recipe/fully_async_policy/unittest/protocol_examples.py create mode 100644 recipe/fully_async_policy/unittest/test_protocol_split_merge.py diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py index c17b2d8dbdd..404ffba4874 100644 --- a/recipe/fully_async_policy/fully_async_main.py +++ b/recipe/fully_async_policy/fully_async_main.py @@ -316,8 +316,8 @@ def _run_training_loop(self): ray.get(rollouter_future) # ray.get(trainer_future) - self.components['message_queue_client'].clear_queue.remote() - + self.components["message_queue_client"].clear_queue() + print("Training completed or interrupted") def _monitor_components(self): diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 5db63c9fab9..5d69e9091ba 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -121,6 +121,12 @@ def __init__( self.message_queue_client = None self.param_synchronizer = None + # 统计信息 + self.processed_samples = 0 + self.stale_samples_processed = 0 + self.current_param_version = 0 + self.param_sync_count = 0 + def set_message_queue_client(self, message_queue_client: MessageQueueClient): """设置消息队列客户端""" with self.lock: @@ -131,7 +137,7 @@ def set_parameter_synchronizer(self, param_synchronizer): with self.lock: self.param_synchronizer = param_synchronizer - def _get_samples_from_queue(self) -> tuple[None, None, None] | tuple[int, dict, Any]: + def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]: """ 从消息队列获取样本并组成gen_batch_output @@ -155,24 +161,16 @@ def _get_samples_from_queue(self) -> tuple[None, None, None] | tuple[int, dict, if not queue_samples or len(queue_samples) == 0: logger.warning("required_samples is empty") - return None, None, None + return None, None logger.info(f"Retrieved {len(queue_samples)} samples from queue") - # 组装gen_batch_output - 
gen_batch_output = self._assemble_gen_batch_output_from_queue_samples( - queue_samples, n_responses_per_prompt, batch_size - ) - - # 从第一个样本中提取原始batch信息来构造batch_dict - first_sample = queue_samples[0].data - batch_dict = self._extract_batch_dict_from_sample(first_sample, batch_size) + # 组装 batch + batch = self._assemble_gen_batch_output_from_queue_samples(queue_samples) - return 0, batch_dict, gen_batch_output + return 0, batch - def _assemble_gen_batch_output_from_queue_samples( - self, queue_samples: list[QueueSample], n_responses_per_prompt: int, batch_size: int - ): + def _assemble_gen_batch_output_from_queue_samples(self, queue_samples: list[QueueSample]): """ 从队列样本中组装gen_batch_output @@ -185,123 +183,57 @@ def _assemble_gen_batch_output_from_queue_samples( DataProto: 组装好的gen_batch_output """ import numpy as np - import torch from verl.protocol import DataProto - # 提取所有样本的数据 + if not queue_samples: + raise ValueError("Empty queue_samples provided for batch assembly") + + logger.debug(f"Assembling batch from {len(queue_samples)} queue samples") + + # 提取所有样本的数据和元数据 sample_data_list = [] rollout_metadata_list = [] timing_info = {} - for sample in queue_samples: + for i, sample in enumerate(queue_samples): sample_data_list.append(sample.data) rollout_metadata_list.append(sample.rollout_metadata) - # 假设所有样本具有相同的数据结构,从第一个样本推断结构 - first_sample_data = sample_data_list[0] - - # 组装tensor数据 - tensor_dict = {} - non_tensor_dict = {} - - # 获取第一个样本的结构来初始化 - if hasattr(first_sample_data, "batch") and first_sample_data.batch is not None: - # 处理tensor数据 - for key in first_sample_data.batch.keys(): - tensor_list = [] - for sample_data in sample_data_list: - if hasattr(sample_data, "batch") and sample_data.batch is not None and key in sample_data.batch: - tensor_list.append(sample_data.batch[key]) - else: - logger.warning(f"Missing key '{key}' in sample batch data") - - if tensor_list: - # 连接所有tensor - tensor_dict[key] = torch.cat(tensor_list, dim=0) - - if 
hasattr(first_sample_data, "non_tensor_batch") and first_sample_data.non_tensor_batch: - # 处理non_tensor数据 - for key in first_sample_data.non_tensor_batch.keys(): - non_tensor_list = [] - for sample_data in sample_data_list: - if ( - hasattr(sample_data, "non_tensor_batch") - and sample_data.non_tensor_batch - and key in sample_data.non_tensor_batch - ): - non_tensor_list.extend(sample_data.non_tensor_batch[key]) - else: - logger.warning(f"Missing key '{key}' in sample non_tensor_batch data") - - if non_tensor_list: - non_tensor_dict[key] = np.array(non_tensor_list, dtype=object) + batch = DataProto.from_items(sample_data_list) # 收集timing信息和metadata - for sample, metadata in zip(queue_samples, rollout_metadata_list, strict=False): + param_versions = [] + sample_timestamps = [] + for metadata in rollout_metadata_list: + # 提取参数版本和时间戳 + param_versions.append(metadata.get("rollout_param_version", 0)) + sample_timestamps.append(metadata.get("generation_timestamp", time.time())) if "timing" in metadata: for timing_key, timing_value in metadata["timing"].items(): if timing_key not in timing_info: timing_info[timing_key] = [] - timing_info[timing_key].append(timing_value) - + # if isinstance(timing_value, (int, float)): + # timing_info[timing_key].append(timing_value) # 计算平均timing avg_timing = {} for key, values in timing_info.items(): - if values: + if values and len(values) > 0: avg_timing[key] = sum(values) / len(values) # 创建meta_info meta_info = { "timing": avg_timing, "queue_sample_count": len(queue_samples), - "rollout_param_versions": [sample.param_version for sample in queue_samples], - "sample_timestamps": [sample.timestamp for sample in queue_samples], + "rollout_param_versions": param_versions, + "sample_timestamps": sample_timestamps, + "param_version_diversity": len(set(param_versions)), + "avg_sample_age": np.mean([time.time() - ts for ts in sample_timestamps]), } - # 创建DataProto对象 - if tensor_dict or non_tensor_dict: - gen_batch_output = DataProto.from_dict( 
- tensors=tensor_dict if tensor_dict else None, - non_tensors=non_tensor_dict if non_tensor_dict else None, - meta_info=meta_info, - ) - else: - # 如果没有数据,创建空的DataProto - logger.warning("No tensor or non_tensor data found in samples, creating empty DataProto") - gen_batch_output = DataProto.from_dict(meta_info=meta_info) - - logger.info(f"Assembled gen_batch_output with {len(gen_batch_output)} samples") - return gen_batch_output - - def _extract_batch_dict_from_sample(self, sample_data, batch_size: int) -> dict: - """ - 从样本数据中提取batch_dict信息 - - Args: - sample_data: 样本数据 - batch_size: 批次大小 - - Returns: - dict: batch字典 - """ - batch_dict = {} - - # 从样本中提取原始输入信息 - if hasattr(sample_data, "batch") and sample_data.batch is not None: - for key, value in sample_data.batch.items(): - # 只保留输入相关的key,去掉生成的输出 - if key in ["input_ids", "attention_mask", "position_ids"]: - # 由于我们有多个响应,需要取出原始prompt部分 - batch_dict[key] = value[:batch_size] if len(value) >= batch_size else value - - if hasattr(sample_data, "non_tensor_batch") and sample_data.non_tensor_batch: - for key, value in sample_data.non_tensor_batch.items(): - # 保留非tensor的批次数据 - if key in ["raw_prompt_ids", "raw_prompt", "multi_modal_data", "tools_kwargs", "interaction_kwargs"]: - batch_dict[key] = np.array(value[:batch_size]) if len(value) >= batch_size else np.array(value) + print(meta_info) - return batch_dict + return batch def _create_actor_rollout_classes(self): # create actor @@ -377,10 +309,6 @@ def fit(self): # 使用队列模式,不需要传统的dataloader迭代器 # 初始化获取第一批数据 while True: - epoch, batch, gen_batch_output = self._get_samples_from_queue() - if gen_batch_output is None: - break - metrics = {} timing_raw = {} @@ -394,7 +322,41 @@ def fit(self): is_last_step = self.global_steps >= self.total_training_steps with marked_timer("step", timing_raw): - batch = self._post_generate_batch(batch, gen_batch_output, metrics) + with marked_timer("gen", timing_raw, color="red"): + epoch, batch = self._get_samples_from_queue() + if batch is 
None: + break + + # 更新统计信息 + with self.lock: + self.processed_samples += len(batch) if isinstance(batch, list) else 1 + + # 从meta_info中获取参数版本信息 + if hasattr(batch, "meta_info") and batch.meta_info: + rollout_param_versions = batch.meta_info.get("rollout_param_versions", []) + if rollout_param_versions: + # 统计陈旧样本 + stale_count = sum(1 for v in rollout_param_versions if self.current_param_version - v > 1) + self.stale_samples_processed += stale_count + + # 添加新鲜度指标到metrics + if rollout_param_versions: + param_version_diversity = batch.meta_info.get("param_version_diversity", 0) + avg_sample_age = batch.meta_info.get("avg_sample_age", 0) + + metrics.update( + { + "freshness/param_version_diversity": param_version_diversity, + "freshness/avg_sample_age": avg_sample_age, + "freshness/stale_samples_ratio": stale_count / len(rollout_param_versions) + if rollout_param_versions + else 0, + "statistics/processed_samples": self.processed_samples, + "statistics/stale_samples_processed": self.stale_samples_processed, + "statistics/current_param_version": self.current_param_version, + } + ) + batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw) self._log_rollout(batch, reward_extra_infos_dict, timing_raw) last_val_metrics = self._validate_metrics(is_last_step, last_val_metrics, metrics, timing_raw) @@ -430,17 +392,80 @@ def get_statistics(self) -> dict: "queue_dropped_samples": queue_stats.get("dropped_samples", 0), } + def update_param_version(self, param_version: int) -> bool: + """ + 更新trainer的参数版本,用于跟踪与rollouter的参数同步状态 + + Args: + param_version: 新的参数版本号 + + Returns: + bool: 是否成功更新 + """ + try: + with self.lock: + old_version = self.current_param_version + self.current_param_version = param_version + self.param_sync_count += 1 + + # 更新消息队列的参数版本 + if self.message_queue_client: + self.message_queue_client.update_param_version(param_version) + + logger.info(f"Updated trainer param version from {old_version} to {param_version}") + return True + 
except Exception as e: + logger.error(f"Error updating param version: {e}") + return False + def _compute_sample_freshness_metrics(self, batch_samples: list[QueueSample]) -> dict: - """计算样本新鲜度指标""" - sample_ages = [self.current_param_version - sample.param_version for sample in batch_samples] - current_time = time.time() - sample_latencies = [current_time - sample.timestamp for sample in batch_samples] + """ + 计算样本新鲜度指标 - return { - "freshness/avg_sample_age": np.mean(sample_ages), - "freshness/max_sample_age": max(sample_ages), - "freshness/min_sample_age": min(sample_ages), - "freshness/avg_sample_latency": np.mean(sample_latencies), - "freshness/max_sample_latency": max(sample_latencies), - "freshness/stale_samples_ratio": sum(1 for age in sample_ages if age > 1) / len(sample_ages), - } + Args: + batch_samples: 队列样本列表 + + Returns: + dict: 新鲜度指标字典 + """ + if not batch_samples: + return {} + + try: + # 提取参数版本和时间戳 + sample_ages = [] + sample_latencies = [] + current_time = time.time() + + for sample in batch_samples: + # 从rollout_metadata中获取信息 + if hasattr(sample, "rollout_metadata") and sample.rollout_metadata: + rollout_version = sample.rollout_metadata.get("rollout_param_version", 0) + generation_time = sample.rollout_metadata.get("generation_timestamp", current_time) + else: + rollout_version = 0 + generation_time = current_time + + age = max(0, self.current_param_version - rollout_version) + latency = max(0, current_time - generation_time) + + sample_ages.append(age) + sample_latencies.append(latency) + + if not sample_ages: + return {} + + return { + "freshness/avg_sample_age": np.mean(sample_ages), + "freshness/max_sample_age": max(sample_ages), + "freshness/min_sample_age": min(sample_ages), + "freshness/avg_sample_latency": np.mean(sample_latencies), + "freshness/max_sample_latency": max(sample_latencies), + "freshness/min_sample_latency": min(sample_latencies), + "freshness/stale_samples_ratio": sum(1 for age in sample_ages if age > 1) / len(sample_ages), 
+ "freshness/sample_count": len(sample_ages), + } + + except Exception as e: + logger.error(f"Error computing freshness metrics: {e}") + return {"freshness/error": str(e)} diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py index f4dcd1f522d..ae4ba6c45ad 100644 --- a/recipe/fully_async_policy/message_queue.py +++ b/recipe/fully_async_policy/message_queue.py @@ -14,8 +14,6 @@ import logging import threading -import time -import uuid from collections import deque from dataclasses import dataclass from typing import Any diff --git a/recipe/fully_async_policy/unittest/protocol_examples.py b/recipe/fully_async_policy/unittest/protocol_examples.py new file mode 100644 index 00000000000..b695c163c23 --- /dev/null +++ b/recipe/fully_async_policy/unittest/protocol_examples.py @@ -0,0 +1,202 @@ +# Copyright 2025 Meituan Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import torch + +from verl.protocol import DataProto, DataProtoItem + + +def example_basic_split_merge(): + """Basic example of splitting DataProto into DataProtoItems and merging back.""" + print("=== Basic Split and Merge Example ===") + + # Create sample data + batch_size = 3 + seq_len = 5 + + # Create tensors + input_ids = torch.randint(0, 1000, (batch_size, seq_len)) + attention_mask = torch.ones(batch_size, seq_len) + + # Create non-tensor data + prompts = np.array(["Hello world", "How are you?", "Good morning"], dtype=object) + scores = np.array([0.8, 0.9, 0.7], dtype=object) + + # Create DataProto + data_proto = DataProto.from_dict( + tensors={"input_ids": input_ids, "attention_mask": attention_mask}, + non_tensors={"prompts": prompts, "scores": scores}, + meta_info={"model_name": "test_model", "version": "1.0"}, + ) + + print(f"Original DataProto length: {len(data_proto)}") + print(f"Input IDs shape: {data_proto.batch['input_ids'].shape}") + print(f"Prompts: {data_proto.non_tensor_batch['prompts']}") + + # Split into DataProtoItems + items = data_proto.to_items() + print(f"\nSplit into {len(items)} items") + + for i, item in enumerate(items): + print(f"Item {i}:") + print(f" Input IDs shape: {item.batch['input_ids'].shape}") + print(f" Prompt: {item.non_tensor_batch['prompts']}") + print(f" Score: {item.non_tensor_batch['scores']}") + + # Merge back to DataProto + merged_proto = DataProto.from_items(items) + print(f"\nMerged DataProto length: {len(merged_proto)}") + print(f"Merged Input IDs shape: {merged_proto.batch['input_ids'].shape}") + print(f"Merged prompts: {merged_proto.non_tensor_batch['prompts']}") + + # Verify they're identical + assert torch.equal(data_proto.batch["input_ids"], merged_proto.batch["input_ids"]) + assert torch.equal(data_proto.batch["attention_mask"], merged_proto.batch["attention_mask"]) + assert np.array_equal(data_proto.non_tensor_batch["prompts"], merged_proto.non_tensor_batch["prompts"]) + assert 
np.array_equal(data_proto.non_tensor_batch["scores"], merged_proto.non_tensor_batch["scores"]) + + print("\n✓ Original and merged DataProto are identical!") + + +def example_item_processing(): + """Example showing individual item processing before merging.""" + print("\n=== Individual Item Processing Example ===") + + # Create initial data + # batch_size = 4 + + values = torch.tensor([1.0, 2.0, 3.0, 4.0]).unsqueeze(1) # Shape: (4, 1) + labels = np.array(["A", "B", "C", "D"], dtype=object) + + original_proto = DataProto.from_dict( + tensors={"values": values}, non_tensors={"labels": labels}, meta_info={"processing_step": 0} + ) + + print(f"Original values: {original_proto.batch['values'].flatten()}") + print(f"Original labels: {original_proto.non_tensor_batch['labels']}") + + # Split and process each item individually + items = original_proto.to_items() + processed_items = [] + + for i, item in enumerate(items): + # Process the tensor data (multiply by 2) + processed_value = item.batch["values"] * 2 + + # Process the non-tensor data (add suffix) + processed_label = item.non_tensor_batch["labels"] + f"_processed_{i}" + + # Create new processed item + processed_item = DataProtoItem( + batch=item.batch.clone(), # Clone the TensorDict + non_tensor_batch=item.non_tensor_batch.copy(), + meta_info=item.meta_info.copy(), + ) + + # Update with processed data + processed_item.batch["values"] = processed_value + processed_item.non_tensor_batch["labels"] = processed_label + processed_item.meta_info["processing_step"] = 1 + + processed_items.append(processed_item) + + print(f"Processed item {i}: value={processed_value.item()}, label='{processed_label}'") + + # Merge processed items back + processed_proto = DataProto.from_items(processed_items) + + print(f"\nProcessed values: {processed_proto.batch['values'].flatten()}") + print(f"Processed labels: {processed_proto.non_tensor_batch['labels']}") + print(f"Processing step: {processed_proto.meta_info['processing_step']}") + + +def 
example_convenience_methods(): + """Example showing convenience methods.""" + print("\n=== Convenience Methods Example ===") + + # Create a single DataProtoItem + single_tensor = torch.tensor([42]).unsqueeze(0) # Shape: (1,) + single_item = DataProtoItem( + batch=None, # We'll create TensorDict manually + non_tensor_batch={"text": "Hello"}, + meta_info={"source": "manual"}, + ) + + # Create TensorDict manually for the single item + from tensordict import TensorDict + + single_item.batch = TensorDict({"data": single_tensor}, batch_size=(1,)) + + print(f"Single item data: {single_item.batch['data']}") + print(f"Single item text: {single_item.non_tensor_batch['text']}") + + # Convert single item to DataProto using convenience method + single_proto = single_item.to_proto() + print(f"Converted to DataProto length: {len(single_proto)}") + + # Create multiple items and use static convenience method + items = [single_item] + for i in range(2): + new_item = single_item.copy() # Use the copy method + new_item.batch["data"] = torch.tensor([100 + i]).unsqueeze(0) + new_item.non_tensor_batch["text"] = f"Item {i + 1}" + items.append(new_item) + + # Use DataProtoItem.from_items() convenience method + merged_proto = DataProtoItem.from_items(items) + print(f"Merged using convenience method - length: {len(merged_proto)}") + print(f"Data: {merged_proto.batch['data'].flatten()}") + print(f"Texts: {merged_proto.non_tensor_batch['text']}") + + +def example_error_handling(): + """Example showing error handling.""" + print("\n=== Error Handling Example ===") + + # Try to create DataProto from empty list + try: + DataProto.from_items([]) + print("ERROR: Should have raised exception for empty list") + except ValueError as e: + print(f"✓ Correctly caught error for empty list: {e}") + + # Try to merge items with inconsistent structure + try: + item1 = DataProtoItem( + batch=TensorDict({"data": torch.tensor([1]).unsqueeze(0)}, batch_size=(1,)), + non_tensor_batch={"text": "Hello"}, + ) + item2 
= DataProtoItem( + batch=TensorDict({"different_key": torch.tensor([2]).unsqueeze(0)}, batch_size=(1,)), + non_tensor_batch={"text": "World"}, + ) + + DataProto.from_items([item1, item2]) + print("ERROR: Should have raised exception for inconsistent structure") + except ValueError as e: + print(f"✓ Correctly caught error for inconsistent structure: {e}") + + +if __name__ == "__main__": + # Import tensordict for the examples + from tensordict import TensorDict + + # Run all examples + example_basic_split_merge() + example_item_processing() + example_convenience_methods() + example_error_handling() + + print("\n🎉 All examples completed successfully!") diff --git a/recipe/fully_async_policy/unittest/test_protocol_split_merge.py b/recipe/fully_async_policy/unittest/test_protocol_split_merge.py new file mode 100644 index 00000000000..7c959a791bb --- /dev/null +++ b/recipe/fully_async_policy/unittest/test_protocol_split_merge.py @@ -0,0 +1,466 @@ +# Copyright 2025 Meituan Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import torch +from tensordict import TensorDict + +from verl.protocol import DataProto + + +def create_sample_dataproto(): + """Create a DataProto similar to the provided example.""" + + # Create tensor data similar to the example + batch_size = 12 + + # Tensor data + attention_mask = torch.ones(batch_size, 3072, dtype=torch.int64) + input_ids = torch.randint(0, 32000, (batch_size, 3072), dtype=torch.int64) + position_ids = torch.arange(3072).unsqueeze(0).repeat(batch_size, 1).long() + prompts = torch.randint(0, 32000, (batch_size, 1024), dtype=torch.int64) + response_mask = torch.ones(batch_size, 2048, dtype=torch.int64) + responses = torch.randint(0, 32000, (batch_size, 2048), dtype=torch.int64) + + # Non-tensor data similar to the example + data_source = np.array(["openai/gsm8k"] * batch_size, dtype=object) + ability = np.array(["math"] * batch_size, dtype=object) + + reward_model = np.array( + [ + {"ground_truth": "6", "style": "rule"}, + {"ground_truth": "6", "style": "rule"}, + {"ground_truth": "220000", "style": "rule"}, + {"ground_truth": "277", "style": "rule"}, + {"ground_truth": "277", "style": "rule"}, + {"ground_truth": "35", "style": "rule"}, + {"ground_truth": "6", "style": "rule"}, + {"ground_truth": "220000", "style": "rule"}, + {"ground_truth": "220000", "style": "rule"}, + {"ground_truth": "277", "style": "rule"}, + {"ground_truth": "35", "style": "rule"}, + {"ground_truth": "35", "style": "rule"}, + ], + dtype=object, + ) + + extra_info = np.array( + [ + {"answer": "Answer 1", "index": 4570, "question": "Question 1", "split": "train"}, + {"answer": "Answer 1", "index": 4570, "question": "Question 1", "split": "train"}, + {"answer": "Answer 2", "index": 460, "question": "Question 2", "split": "train"}, + {"answer": "Answer 3", "index": 6613, "question": "Question 3", "split": "train"}, + {"answer": "Answer 3", "index": 6613, "question": "Question 3", "split": "train"}, + {"answer": "Answer 4", "index": 1421, "question": 
"Question 4", "split": "train"}, + {"answer": "Answer 1", "index": 4570, "question": "Question 1", "split": "train"}, + {"answer": "Answer 2", "index": 460, "question": "Question 2", "split": "train"}, + {"answer": "Answer 2", "index": 460, "question": "Question 2", "split": "train"}, + {"answer": "Answer 3", "index": 6613, "question": "Question 3", "split": "train"}, + {"answer": "Answer 4", "index": 1421, "question": "Question 4", "split": "train"}, + {"answer": "Answer 4", "index": 1421, "question": "Question 4", "split": "train"}, + ], + dtype=object, + ) + + uid = np.array( + [ + "80ae1835-a8db-4faa-8b42-2ffa2ca63f28", + "80ae1835-a8db-4faa-8b42-2ffa2ca63f28", + "cc529271-c2ba-4fe1-a16e-50c5f090538d", + "237ea082-350f-4193-b9a2-3a153a3a38b9", + "237ea082-350f-4193-b9a2-3a153a3a38b9", + "fab3e910-67b3-4653-bc69-377250049267", + "80ae1835-a8db-4faa-8b42-2ffa2ca63f28", + "cc529271-c2ba-4fe1-a16e-50c5f090538d", + "cc529271-c2ba-4fe1-a16e-50c5f090538d", + "237ea082-350f-4193-b9a2-3a153a3a38b9", + "fab3e910-67b3-4653-bc69-377250049267", + "fab3e910-67b3-4653-bc69-377250049267", + ], + dtype=object, + ) + + tools_kwargs = np.array([{}] * batch_size, dtype=object) + interaction_kwargs = np.array([{}] * batch_size, dtype=object) + index = np.array([4570, 4570, 460, 6613, 6613, 1421, 4570, 460, 460, 6613, 1421, 1421], dtype=object) + + # Create DataProto + data_proto = DataProto.from_dict( + tensors={ + "attention_mask": attention_mask, + "input_ids": input_ids, + "position_ids": position_ids, + "prompts": prompts, + "response_mask": response_mask, + "responses": responses, + }, + non_tensors={ + "data_source": data_source, + "ability": ability, + "reward_model": reward_model, + "extra_info": extra_info, + "uid": uid, + "tools_kwargs": tools_kwargs, + "interaction_kwargs": interaction_kwargs, + "index": index, + }, + meta_info={"global_token_num": [2141, 2141, 2161, 2151, 2151, 2130, 2141, 2161, 2161, 2151, 2130, 2130]}, + ) + + return data_proto + + +def 
test_basic_split_and_merge(): + """Test basic split and merge functionality.""" + print("=== Testing Basic Split and Merge ===") + + # Create sample data + original_proto = create_sample_dataproto() + original_length = len(original_proto) + + print(f"Original DataProto length: {original_length}") + print(f"Original tensor keys: {list(original_proto.batch.keys())}") + print(f"Original non_tensor keys: {list(original_proto.non_tensor_batch.keys())}") + + # Test split + items = original_proto.to_items() + + print(f"Split into {len(items)} items") + assert len(items) == original_length, f"Expected {original_length} items, got {len(items)}" + + # Verify individual items + for i, item in enumerate(items): + print(f"Item {i}: batch_size={item.batch.batch_size}, non_tensor keys={list(item.non_tensor_batch.keys())}") + + # Check that tensor shapes are correct (no batch dimension) + assert item.batch.batch_size == torch.Size([]), ( + f"Item {i} should have empty batch_size, got {item.batch.batch_size}" + ) + + # Check tensor shapes + assert item.batch["attention_mask"].shape == torch.Size([3072]), ( + f"Unexpected attention_mask shape: {item.batch['attention_mask'].shape}" + ) + assert item.batch["input_ids"].shape == torch.Size([3072]), ( + f"Unexpected input_ids shape: {item.batch['input_ids'].shape}" + ) + assert item.batch["prompts"].shape == torch.Size([1024]), ( + f"Unexpected prompts shape: {item.batch['prompts'].shape}" + ) + + # Check non-tensor data types + assert isinstance(item.non_tensor_batch["data_source"], str), ( + f"data_source should be str, got {type(item.non_tensor_batch['data_source'])}" + ) + assert isinstance(item.non_tensor_batch["reward_model"], dict), ( + f"reward_model should be dict, got {type(item.non_tensor_batch['reward_model'])}" + ) + assert isinstance(item.non_tensor_batch["extra_info"], dict), ( + f"extra_info should be dict, got {type(item.non_tensor_batch['extra_info'])}" + ) + + # Test merge + merged_proto = DataProto.from_items(items) 
+ + print(f"Merged DataProto length: {len(merged_proto)}") + assert len(merged_proto) == original_length, f"Merged length should be {original_length}, got {len(merged_proto)}" + + # Verify tensor data consistency + for key in original_proto.batch.keys(): + original_tensor = original_proto.batch[key] + merged_tensor = merged_proto.batch[key] + + assert original_tensor.shape == merged_tensor.shape, ( + f"Shape mismatch for {key}: {original_tensor.shape} vs {merged_tensor.shape}" + ) + assert torch.equal(original_tensor, merged_tensor), f"Tensor data mismatch for {key}" + + # Verify non-tensor data consistency + for key in original_proto.non_tensor_batch.keys(): + original_array = original_proto.non_tensor_batch[key] + merged_array = merged_proto.non_tensor_batch[key] + + assert original_array.shape == merged_array.shape, ( + f"Shape mismatch for {key}: {original_array.shape} vs {merged_array.shape}" + ) + assert np.array_equal(original_array, merged_array), f"Non-tensor data mismatch for {key}" + + # Verify meta_info consistency + assert original_proto.meta_info == merged_proto.meta_info, "Meta info mismatch" + + print("✓ Basic split and merge test passed!") + + +def test_individual_item_access(): + """Test accessing individual items matches split results.""" + print("\n=== Testing Individual Item Access ===") + + original_proto = create_sample_dataproto() + items = original_proto.to_items() + + # Compare direct indexing with split results + for i in range(len(original_proto)): + direct_item = original_proto[i] + split_item = items[i] + + # Check tensor data + for key in original_proto.batch.keys(): + assert torch.equal(direct_item.batch[key], split_item.batch[key]), ( + f"Tensor mismatch at index {i}, key {key}" + ) + + # Check non-tensor data + for key in original_proto.non_tensor_batch.keys(): + if isinstance(direct_item.non_tensor_batch[key], np.ndarray): + assert np.array_equal(direct_item.non_tensor_batch[key], split_item.non_tensor_batch[key]), ( + 
f"Non-tensor mismatch at index {i}, key {key}" + ) + else: + assert direct_item.non_tensor_batch[key] == split_item.non_tensor_batch[key], ( + f"Non-tensor mismatch at index {i}, key {key}" + ) + + print("✓ Individual item access test passed!") + + +def test_partial_merge(): + """Test merging a subset of items.""" + print("\n=== Testing Partial Merge ===") + + original_proto = create_sample_dataproto() + items = original_proto.to_items() + + # Take a subset of items + subset_indices = [0, 2, 4, 7, 9] + subset_items = [items[i] for i in subset_indices] + + # Merge the subset + subset_proto = DataProto.from_items(subset_items) + + assert len(subset_proto) == len(subset_indices), ( + f"Subset length should be {len(subset_indices)}, got {len(subset_proto)}" + ) + + # Verify the subset contains correct data + for i, original_idx in enumerate(subset_indices): + # Compare with original data at original_idx + for key in original_proto.batch.keys(): + expected_tensor = original_proto.batch[key][original_idx] + actual_tensor = subset_proto.batch[key][i] + assert torch.equal(expected_tensor, actual_tensor), f"Subset tensor mismatch at {i}, key {key}" + + for key in original_proto.non_tensor_batch.keys(): + expected_value = original_proto.non_tensor_batch[key][original_idx] + actual_value = subset_proto.non_tensor_batch[key][i] + + if isinstance(expected_value, np.ndarray): + assert np.array_equal(expected_value, actual_value), f"Subset non-tensor mismatch at {i}, key {key}" + else: + assert expected_value == actual_value, f"Subset non-tensor mismatch at {i}, key {key}" + + print("✓ Partial merge test passed!") + + +def test_item_processing(): + """Test processing individual items before merging.""" + print("\n=== Testing Item Processing ===") + + original_proto = create_sample_dataproto() + items = original_proto.to_items() + + # Process each item (e.g., add a prefix to uid) + processed_items = [] + for i, item in enumerate(items): + processed_item = item.copy() # Create a 
copy to avoid modifying original + + # Modify some data + processed_item.non_tensor_batch["uid"] = f"processed_{i}_{processed_item.non_tensor_batch['uid']}" + processed_item.non_tensor_batch["processing_step"] = i + processed_item.meta_info["processed"] = True + + processed_items.append(processed_item) + + # Merge processed items + processed_proto = DataProto.from_items(processed_items) + + # Verify processing was applied + for i in range(len(processed_proto)): + expected_uid = f"processed_{i}_{items[i].non_tensor_batch['uid']}" + actual_uid = processed_proto.non_tensor_batch["uid"][i] + assert actual_uid == expected_uid, ( + f"Processing failed for uid at {i}: expected {expected_uid}, got {actual_uid}" + ) + + expected_step = i + actual_step = processed_proto.non_tensor_batch["processing_step"][i] + assert actual_step == expected_step, ( + f"Processing step mismatch at {i}: expected {expected_step}, got {actual_step}" + ) + + # assert processed_proto.meta_info.get("processed") == True, "Meta info processing failed" + + print("✓ Item processing test passed!") + + +def test_error_conditions(): + """Test error conditions.""" + print("\n=== Testing Error Conditions ===") + + # Test empty list + try: + DataProto.from_items([]) + except ValueError as e: + print(f"✓ Correctly caught empty list error: {e}") + + # Test inconsistent structure + try: + # Create items with different tensor keys + original_proto = create_sample_dataproto() + items = original_proto.to_items() + + # Modify one item to have different keys + modified_item = items[1].copy() + modified_item.batch = TensorDict({"different_key": torch.randn(3072)}, batch_size=torch.Size([])) + + inconsistent_items = [items[0], modified_item] + DataProto.from_items(inconsistent_items) + except ValueError as e: + print(f"✓ Correctly caught inconsistent structure error: {e}") + + print("✓ Error conditions test passed!") + + +def test_roundtrip_integrity(): + """Test multiple split/merge cycles maintain data integrity.""" 
+ print("\n=== Testing Roundtrip Integrity ===") + + original_proto = create_sample_dataproto() + current_proto = original_proto + + # Perform multiple split/merge cycles + for cycle in range(3): + print(f"Cycle {cycle + 1}") + + # Split + items = current_proto.to_items() + + # Merge + current_proto = DataProto.from_items(items) + + # Verify integrity + assert len(current_proto) == len(original_proto), f"Length changed in cycle {cycle + 1}" + + for key in original_proto.batch.keys(): + assert torch.equal(original_proto.batch[key], current_proto.batch[key]), ( + f"Tensor {key} changed in cycle {cycle + 1}" + ) + + for key in original_proto.non_tensor_batch.keys(): + assert np.array_equal(original_proto.non_tensor_batch[key], current_proto.non_tensor_batch[key]), ( + f"Non-tensor {key} changed in cycle {cycle + 1}" + ) + + assert original_proto.meta_info == current_proto.meta_info, f"Meta info changed in cycle {cycle + 1}" + + print("✓ Roundtrip integrity test passed!") + + +def run_visual_comparison(): + """Run a visual comparison similar to the user's example.""" + print("\n=== Visual Comparison (Like User Example) ===") + + original_proto = create_sample_dataproto() + + print("Original DataProto:") + print(f"batch_size: {original_proto.batch.batch_size}") + print(f"tensor keys: {list(original_proto.batch.keys())}") + print(f"non_tensor keys: {list(original_proto.non_tensor_batch.keys())}") + print(f"Sample data_source: {original_proto.non_tensor_batch['data_source'][:3]}") + print(f"Sample uid: {original_proto.non_tensor_batch['uid'][:3]}") + + print("\n" + "=" * 50) + print("============= SPLIT =============") + print("=" * 50) + + items = original_proto.to_items() + + # Show first few items + for i in range(min(3, len(items))): + print(f"\nDataProtoItem {i}:") + print(f"batch_size: {items[i].batch.batch_size}") + print(f"attention_mask shape: {items[i].batch['attention_mask'].shape}") + print(f"input_ids shape: {items[i].batch['input_ids'].shape}") + 
print(f"data_source: {items[i].non_tensor_batch['data_source']}") + print(f"uid: {items[i].non_tensor_batch['uid']}") + print(f"reward_model: {items[i].non_tensor_batch['reward_model']}") + print("-" * 30) + + print("\n" + "=" * 50) + print("============= MERGE =============") + print("=" * 50) + + merged_proto = DataProto.from_items(items) + + print("Merged DataProto:") + print(f"batch_size: {merged_proto.batch.batch_size}") + print(f"tensor keys: {list(merged_proto.batch.keys())}") + print(f"non_tensor keys: {list(merged_proto.non_tensor_batch.keys())}") + print(f"Sample data_source: {merged_proto.non_tensor_batch['data_source'][:3]}") + print(f"Sample uid: {merged_proto.non_tensor_batch['uid'][:3]}") + + # Verify they're identical + success = True + try: + for key in original_proto.batch.keys(): + assert torch.equal(original_proto.batch[key], merged_proto.batch[key]) + for key in original_proto.non_tensor_batch.keys(): + assert np.array_equal(original_proto.non_tensor_batch[key], merged_proto.non_tensor_batch[key]) + assert original_proto.meta_info == merged_proto.meta_info + print("\n✓ Original and merged DataProto are identical!") + except Exception as e: + print(f"\n✗ Verification failed: {e}") + success = False + + return success + + +if __name__ == "__main__": + print("Testing DataProto Split/Merge Functionality") + print("=" * 60) + + try: + # Run all tests + test_basic_split_and_merge() + test_individual_item_access() + test_partial_merge() + test_item_processing() + test_error_conditions() + test_roundtrip_integrity() + + # Run visual comparison + visual_success = run_visual_comparison() + + if visual_success: + print("\n" + "=" * 60) + print("🎉 ALL TESTS PASSED!") + print("DataProto split/merge functionality is working correctly.") + else: + print("\n" + "=" * 60) + print("❌ SOME TESTS FAILED!") + + except Exception as e: + print(f"\n❌ Test failed with exception: {e}") + import traceback + + traceback.print_exc() diff --git a/verl/protocol.py 
b/verl/protocol.py index a4d394af97d..17b3b10c1f6 100644 --- a/verl/protocol.py +++ b/verl/protocol.py @@ -38,7 +38,7 @@ from verl.utils.py_functional import union_two_dict from verl.utils.torch_functional import allgather_dict_tensors -__all__ = ["DataProto", "union_tensor_dict"] +__all__ = ["DataProto", "DataProtoItem", "union_tensor_dict"] with contextlib.suppress(Exception): tensordict.set_lazy_legacy(False).set() @@ -198,11 +198,83 @@ def collate_fn(x: list["DataProtoItem"]): @dataclass class DataProtoItem: - # TODO(zhangchi.usc1992) add consistency check + """ + A single item from a DataProto batch, representing one sample. + This is typically used when accessing individual elements from a DataProto. + """ + batch: TensorDict = None non_tensor_batch: dict = field(default_factory=dict) meta_info: dict = field(default_factory=dict) + def __post_init__(self): + """Perform consistency checking after initialization.""" + self._check_consistency() + + def _check_consistency(self): + """Check the consistency of the DataProtoItem.""" + # For DataProtoItem, batch can have no batch dimension (batch_size=[]) or batch size 1 + if self.batch is not None: + # Allow both cases: tensors without batch dim (batch_size=[]) and tensors with batch size 1 + if hasattr(self.batch, "batch_size") and len(self.batch.batch_size) > 0: + if self.batch.batch_size[0] > 1: + raise ValueError( + f"DataProtoItem batch should have batch size 0 or 1, got {self.batch.batch_size[0]}" + ) + + # Check non_tensor_batch consistency + if self.non_tensor_batch: + for key, val in self.non_tensor_batch.items(): + # For DataProtoItem, non_tensor values should be individual items, not arrays + if isinstance(val, np.ndarray) and val.shape != (): + # Allow only scalar numpy arrays (shape=()) for individual items + if val.shape[0] > 1: + raise ValueError( + f"DataProtoItem non_tensor_batch['{key}']" + "should be a single item, got array with shape {val.shape}" + ) + + def to_proto(self) -> "DataProto": + 
"""Convert this DataProtoItem to a DataProto with batch size 1. + + Returns: + DataProto: A DataProto containing this single item + """ + return DataProto.from_items([self]) + + @staticmethod + def from_items(items: list["DataProtoItem"]) -> "DataProto": + """Create a DataProto from a list of DataProtoItem objects. + + This is a convenience method that calls DataProto.from_items(). + + Args: + items (List[DataProtoItem]): A list of DataProtoItem objects to merge + + Returns: + DataProto: A new DataProto containing all the items as a batch + """ + return DataProto.from_items(items) + + def copy(self) -> "DataProtoItem": + """Create a deep copy of this DataProtoItem. + + Returns: + DataProtoItem: A deep copy of this item + """ + import copy + + # Deep copy the batch TensorDict + batch_copy = copy.deepcopy(self.batch) if self.batch is not None else None + + # Deep copy non_tensor_batch + non_tensor_copy = copy.deepcopy(self.non_tensor_batch) + + # Deep copy meta_info + meta_info_copy = copy.deepcopy(self.meta_info) + + return DataProtoItem(batch=batch_copy, non_tensor_batch=non_tensor_copy, meta_info=meta_info_copy) + @dataclass class DataProto: @@ -738,6 +810,96 @@ def split(self, split_size: int) -> list["DataProto"]: """ return [self[i : i + split_size] for i in range(0, len(self), split_size)] + def to_items(self) -> list["DataProtoItem"]: + """Convert DataProto to a list of DataProtoItem objects. + + Returns: + List[DataProtoItem]: A list containing individual DataProtoItem objects, + one for each sample in the batch + """ + items = [] + for i in range(len(self)): + # Use the existing __getitem__ implementation for single integer access + items.append(self[i]) + return items + + @staticmethod + def from_items(items: list["DataProtoItem"]) -> "DataProto": + """Create a DataProto from a list of DataProtoItem objects. 
+ + Args: + items (List[DataProtoItem]): A list of DataProtoItem objects to merge + + Returns: + DataProto: A new DataProto containing all the items as a batch + + Raises: + ValueError: If the input list is empty or items have inconsistent structure + """ + if not items: + raise ValueError("Cannot create DataProto from empty list of items") + + # Get the first item to determine structure and meta_info + first_item = items[0] + meta_info = first_item.meta_info + + # Collect all tensor batches + batch_tensors = {} + non_tensor_batches = {} + + # Process tensor data + if first_item.batch is not None: + # Get all keys from the first item's batch + tensor_keys = list(first_item.batch.keys()) + + for key in tensor_keys: + tensor_list = [] + for i, item in enumerate(items): + if item.batch is None or key not in item.batch: + raise ValueError(f"Item {i} missing tensor key '{key}' in batch") + + tensor = item.batch[key] + # Handle tensors from DataProtoItem which may not have batch dimension + # (as shown in the user's example where batch_size=torch.Size([])) + if tensor.dim() == 0: + # Scalar tensor - add batch dimension + tensor = tensor.unsqueeze(0) + else: + # Multi-dimensional tensor without batch dimension - add batch dimension + tensor = tensor.unsqueeze(0) + + tensor_list.append(tensor) + + # Concatenate tensors along batch dimension + if tensor_list: + batch_tensors[key] = torch.cat(tensor_list, dim=0) + + # Process non-tensor data + if first_item.non_tensor_batch: + non_tensor_keys = list(first_item.non_tensor_batch.keys()) + + for key in non_tensor_keys: + non_tensor_list = [] + for i, item in enumerate(items): + if key not in item.non_tensor_batch: + raise ValueError(f"Item {i} missing non_tensor key '{key}'") + + non_tensor_data = item.non_tensor_batch[key] + non_tensor_list.append(non_tensor_data) + + # Stack non-tensor data + if non_tensor_list: + non_tensor_batches[key] = np.array(non_tensor_list, dtype=object) + + # Create TensorDict for batch + if 
batch_tensors: + batch_size = len(items) + batch = TensorDict(source=batch_tensors, batch_size=(batch_size,)) + else: + batch = None + + return DataProto(batch=batch, non_tensor_batch=non_tensor_batches, meta_info=meta_info) + @staticmethod def concat(data: list["DataProto"]) -> "DataProto": """Concat a list of DataProto. The batch is concatenated among dim=0. From bc6aedd8f0caf8a97b2e7a90547158fef47d8d77 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Wed, 6 Aug 2025 17:44:48 +0800 Subject: [PATCH 027/182] train one step --- recipe/fully_async_policy/fully_async_main.py | 12 +- .../fully_async_policy/fully_async_trainer.py | 97 ++++--- recipe/fully_async_policy/message_queue.py | 8 +- recipe/fully_async_policy/unittest/test_mq.py | 242 +++++++++++++++++- .../fully_async_policy/unittest/test_mq2.py | 171 +++++++++++++ tests/special_e2e/run_fully_async_policy.sh | 6 +- verl/trainer/ppo/ray_trainer.py | 9 + 7 files changed, 496 insertions(+), 49 deletions(-) create mode 100644 recipe/fully_async_policy/unittest/test_mq2.py diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py index 404ffba4874..888c6c73594 100644 --- a/recipe/fully_async_policy/fully_async_main.py +++ b/recipe/fully_async_policy/fully_async_main.py @@ -241,7 +241,7 @@ def _initialize_components(self, config) -> None: # 创建Trainer print("Creating FullyAsyncTrainer...") - # self._create_trainer(config) + self._create_trainer(config) # 设置参数同步 # print("Setting up parameter synchronization...") @@ -311,11 +311,15 @@ def _run_training_loop(self): print("Starting Rollouter in background...") rollouter_future = self.components["rollouter"].fit.remote() - # trainer_future = self.components["trainer"].fit.remote() + trainer_future = self.components["trainer"].fit.remote() # self._monitor_components() - ray.get(rollouter_future) - # ray.get(trainer_future) + print("Starting Trainer...") + time.sleep(10) + print("Starting Trainer...") + + 
ray.get(rollouter_future) + ray.get(trainer_future) self.components["message_queue_client"].clear_queue() print("Training completed or interrupted") diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 5d69e9091ba..9830aef595e 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -152,22 +152,31 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]: batch_size = self.config.data.train_batch_size required_samples = n_responses_per_prompt * batch_size - logger.info( - f"Requesting {required_samples} samples from queue (n={n_responses_per_prompt}, batch_size={batch_size})" + print( + f"Requesting {required_samples} samples from queue (n={n_responses_per_prompt}, batch_size={batch_size})", + flush=True, ) # 从队列获取样本 + consumer_start = time.time() queue_samples = self.message_queue_client.get_samples(min_batch_count=required_samples) + consumer_end = time.time() if not queue_samples or len(queue_samples) == 0: logger.warning("required_samples is empty") return None, None - logger.info(f"Retrieved {len(queue_samples)} samples from queue") + print(f"Retrieved {len(queue_samples)} samples from queue. 
wait time {consumer_end - consumer_start}") + + queue_samples = [ray.cloudpickle.loads(x) for x in queue_samples] + print(queue_samples) # 组装 batch batch = self._assemble_gen_batch_output_from_queue_samples(queue_samples) + print("=" * 200) + print(batch) + return 0, batch def _assemble_gen_batch_output_from_queue_samples(self, queue_samples: list[QueueSample]): @@ -189,7 +198,7 @@ def _assemble_gen_batch_output_from_queue_samples(self, queue_samples: list[Queu if not queue_samples: raise ValueError("Empty queue_samples provided for batch assembly") - logger.debug(f"Assembling batch from {len(queue_samples)} queue samples") + print(f"Assembling batch from {len(queue_samples)} queue samples") # 提取所有样本的数据和元数据 sample_data_list = [] @@ -271,6 +280,8 @@ def fit(self): The light-weight advantage computation is done on the driver process. """ + print("FullyAsyncTrainer run") + if self.message_queue_client is None: raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.") @@ -294,10 +305,13 @@ def fit(self): val_metrics = self._validate() assert val_metrics, f"{val_metrics=}" pprint(f"Initial validation metrics: {val_metrics}") - logger.log(data=val_metrics, step=self.global_steps) + print(data=val_metrics, step=self.global_steps) if self.config.trainer.get("val_only", False): return + self.total_training_steps = self.config.trainer.total_training_steps + + print(f"Total training steps: {self.total_training_steps}") # add tqdm progress_bar = tqdm(total=self.total_training_steps, initial=self.global_steps, desc="Training Progress") @@ -309,6 +323,7 @@ def fit(self): # 使用队列模式,不需要传统的dataloader迭代器 # 初始化获取第一批数据 while True: + print("while True", flush=True) metrics = {} timing_raw = {} @@ -327,47 +342,55 @@ def fit(self): if batch is None: break - # 更新统计信息 - with self.lock: - self.processed_samples += len(batch) if isinstance(batch, list) else 1 - - # 从meta_info中获取参数版本信息 - if hasattr(batch, "meta_info") and batch.meta_info: - rollout_param_versions 
= batch.meta_info.get("rollout_param_versions", []) - if rollout_param_versions: - # 统计陈旧样本 - stale_count = sum(1 for v in rollout_param_versions if self.current_param_version - v > 1) - self.stale_samples_processed += stale_count - - # 添加新鲜度指标到metrics - if rollout_param_versions: - param_version_diversity = batch.meta_info.get("param_version_diversity", 0) - avg_sample_age = batch.meta_info.get("avg_sample_age", 0) - - metrics.update( - { - "freshness/param_version_diversity": param_version_diversity, - "freshness/avg_sample_age": avg_sample_age, - "freshness/stale_samples_ratio": stale_count / len(rollout_param_versions) - if rollout_param_versions - else 0, - "statistics/processed_samples": self.processed_samples, - "statistics/stale_samples_processed": self.stale_samples_processed, - "statistics/current_param_version": self.current_param_version, - } - ) - + print("_get_samples_from_queue end") + + # # 更新统计信息 + # with self.lock: + # self.processed_samples += len(batch) if isinstance(batch, list) else 1 + # + # # 从meta_info中获取参数版本信息 + # if hasattr(batch, "meta_info") and batch.meta_info: + # rollout_param_versions = batch.meta_info.get("rollout_param_versions", []) + # if rollout_param_versions: + # # 统计陈旧样本 + # stale_count = sum(1 for v in rollout_param_versions if self.current_param_version - v > 1) + # self.stale_samples_processed += stale_count + # + # # 添加新鲜度指标到metrics + # if rollout_param_versions: + # param_version_diversity = batch.meta_info.get("param_version_diversity", 0) + # avg_sample_age = batch.meta_info.get("avg_sample_age", 0) + # + # metrics.update( + # { + # "freshness/param_version_diversity": param_version_diversity, + # "freshness/avg_sample_age": avg_sample_age, + # "freshness/stale_samples_ratio": stale_count / len(rollout_param_versions) + # if rollout_param_versions + # else 0, + # "statistics/processed_samples": self.processed_samples, + # "statistics/stale_samples_processed": self.stale_samples_processed, + # 
"statistics/current_param_version": self.current_param_version, + # } + # ) + print("_process_batch_common") batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw) + print("_log_rollout") self._log_rollout(batch, reward_extra_infos_dict, timing_raw) + print("_validate_metrics") last_val_metrics = self._validate_metrics(is_last_step, last_val_metrics, metrics, timing_raw) + print("_check_save_checkpoint") self._check_save_checkpoint(is_last_step, timing_raw) + print("_stop_profiling") self._stop_profiling(do_profile, timing_raw) + print("_collect_metrics") self._collect_metrics(batch, epoch, metrics, timing_raw) + print("_post_batch_processing") self._post_batch_processing(batch) # TODO: make a canonical logger that supports various backend - logger.log(data=metrics, step=self.global_steps) + print(data=metrics, step=self.global_steps) progress_bar.update(1) self.global_steps += 1 @@ -412,7 +435,7 @@ def update_param_version(self, param_version: int) -> bool: if self.message_queue_client: self.message_queue_client.update_param_version(param_version) - logger.info(f"Updated trainer param version from {old_version} to {param_version}") + print(f"Updated trainer param version from {old_version} to {param_version}") return True except Exception as e: logger.error(f"Error updating param version: {e}") diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py index ae4ba6c45ad..c6116f0c432 100644 --- a/recipe/fully_async_policy/message_queue.py +++ b/recipe/fully_async_policy/message_queue.py @@ -93,7 +93,7 @@ def put_sample(self, sample: Any, param_version: int) -> bool: if len(self.queue) >= self.max_queue_size: removed = self.queue.popleft() self.dropped_samples += 1 - logger.warning(f"Queue full, dropped sample {removed.id}") + logger.warning(f"Queue full, dropped sample {removed}") self.queue.append(sample) self.total_produced += 1 @@ -105,7 +105,7 @@ def put_sample(self, sample: Any, 
param_version: int) -> bool: return True - def get_samples(self, min_batch_count: int = 1) -> list[QueueSample]: + def get_samples(self, min_batch_count: int = 1) -> list[Any]: """ 从队列获取batch样本,一直等待直到有足够样本 @@ -113,7 +113,7 @@ def get_samples(self, min_batch_count: int = 1) -> list[QueueSample]: min_batch_count: sample数量满足min_batch,一次性获取 Returns: - List[QueueSample]: 获取的样本列表 + List[Any]: 获取的样本列表 """ with self.lock: while len(self.queue) < min_batch_count and self.running: @@ -212,7 +212,7 @@ def put_sample(self, sample: Any, param_version: int) -> bool: """放入batch到队列""" return ray.get(self.queue_actor.put_sample.remote(sample, param_version)) - def get_samples(self, min_batch_count: int = 1) -> list[QueueSample]: + def get_samples(self, min_batch_count: int = 1) -> list[Any]: """从队列获取batch,一直等待直到有足够样本""" return ray.get(self.queue_actor.get_samples.remote(min_batch_count)) diff --git a/recipe/fully_async_policy/unittest/test_mq.py b/recipe/fully_async_policy/unittest/test_mq.py index 2fff49d6576..b766c60f858 100644 --- a/recipe/fully_async_policy/unittest/test_mq.py +++ b/recipe/fully_async_policy/unittest/test_mq.py @@ -316,7 +316,247 @@ def consumer(): finally: client.shutdown() + def test_consume_first_produce_later(self, message_queue_client, mock_data_proto): + """测试先消费、后生产的场景 - 验证阻塞和唤醒机制""" + consumer_result = [] + producer_result = [] + start_time = time.time() + + def consumer_task(): + """消费者任务:先启动,等待生产者生产数据""" + try: + # 记录开始消费的时间 + consumer_start = time.time() + # 这里会阻塞等待,直到有至少2个样本可用 + samples = message_queue_client.get_samples(min_batch_count=2) + consumer_end = time.time() + + consumer_result.append( + { + "success": True, + "samples_count": len(samples), + "wait_time": consumer_end - consumer_start, + "samples": samples, + } + ) + except Exception as e: + consumer_result.append({"success": False, "error": str(e), "wait_time": time.time() - consumer_start}) + + def producer_task(): + """生产者任务:延迟1秒后开始生产""" + try: + # 延迟1秒,确保消费者先开始等待 + time.sleep(1.0) + 
producer_start = time.time() + + # 分两次放入,验证消费者会等到足够的样本数量 + samples_1 = mock_data_proto + result1 = message_queue_client.put_sample( + sample=samples_1, param_version=1, rollout_metadata=[{"batch": "first"}] + ) + + # 短暂延迟后放入第二批 + time.sleep(0.1) + samples_2 = mock_data_proto + result2 = message_queue_client.put_sample( + sample=samples_2, param_version=1, rollout_metadata=[{"batch": "second"}] + ) + + samples_2 = mock_data_proto + result3 = message_queue_client.put_sample( + sample=samples_2, param_version=1, rollout_metadata=[{"batch": "second"}] + ) + + producer_end = time.time() + producer_result.append( + { + "success": result1 and result2, + "put_count": 2, + "produce_time": producer_end - producer_start, + "result1": result1, + "result2": result2, + } + ) + + print("produce finish") + + except Exception as e: + producer_result.append({"success": False, "error": str(e)}) + + # 启动消费者线程(先启动) + consumer_thread = threading.Thread(target=consumer_task, name="Consumer") + # 启动生产者线程(后启动) + producer_thread = threading.Thread(target=producer_task, name="Producer") + + consumer_thread.start() + time.sleep(0.1) # 确保消费者先开始等待 + producer_thread.start() + + print("=========") + # + # # 等待两个线程完成(设置超时避免死锁) + producer_thread.join() + # print("producer_result", producer_result) + # consumer_thread.join() + # print("consumer_thread", consumer_result) + # + # total_time = time.time() - start_time + # + # # 验证结果 + # assert len(consumer_result) == 1, "消费者应该执行一次" + # + # consumer_data = consumer_result[0] + # producer_data = producer_result[0] + # + # # 验证生产者成功 + # assert producer_data['success'], f"生产者失败: {producer_data.get('error', '')}" + # assert producer_data['put_count'] == 2, "应该生产2批数据" + # + # # 验证消费者成功 + # assert consumer_data['success'], f"消费者失败: {consumer_data.get('error', '')}" + # assert consumer_data['samples_count'] == 2, "消费者应该获取到2个样本" + # + # # 验证时序:消费者等待时间应该大于1秒(生产者的延迟时间) + # assert consumer_data['wait_time'] >= 1.0, f"消费者等待时间应该≥1秒,实际: 
{consumer_data['wait_time']:.2f}秒" + # + # # 验证数据完整性 + # assert all(isinstance(sample, QueueSample) for sample in consumer_data['samples']), "获取的样本应该是QueueSample类型" + # + # # 验证队列状态 + # final_queue_size = message_queue_client.get_queue_size() + # assert final_queue_size == 0, "队列应该被清空" + # + # stats = message_queue_client.get_statistics() + # assert stats['total_produced'] == 2, "应该生产了2个样本" + # assert stats['total_consumed'] == 2, "应该消费了2个样本" + # + # print(f"测试成功完成,总耗时: {total_time:.2f}秒") + # print(f"消费者等待时间: {consumer_data['wait_time']:.2f}秒") + # print(f"生产者执行时间: {producer_data['produce_time']:.2f}秒") + + def test_multiple_consumers_single_producer(self, message_queue_client, mock_data_proto): + """测试多个消费者等待单个生产者的场景""" + consumer_results = [] + producer_result = [] + + def consumer_task(consumer_id): + """消费者任务""" + try: + start_time = time.time() + samples = message_queue_client.get_samples(min_batch_count=1) + end_time = time.time() + + consumer_results.append( + { + "id": consumer_id, + "success": True, + "samples_count": len(samples), + "wait_time": end_time - start_time, + } + ) + except Exception as e: + consumer_results.append({"id": consumer_id, "success": False, "error": str(e)}) + + def producer_task(): + """生产者任务:延迟后批量生产""" + try: + time.sleep(1.5) # 确保所有消费者都在等待 + + # 生产3批数据,每批1个样本,供3个消费者消费 + for i in range(3): + samples = [mock_data_proto] + result = message_queue_client.put_sample( + sample=samples, param_version=1, rollout_metadata=[{"batch_id": i}] + ) + producer_result.append(result) + time.sleep(0.1) # 短暂间隔 + + except Exception as e: + producer_result.append(False) + + print("# 启动3个消费者线程") + # consumer_threads = [] + # for i in range(3): + # thread = threading.Thread(target=consumer_task, args=(i,), name=f"Consumer-{i}") + # consumer_threads.append(thread) + # thread.start() + # time.sleep(0.1) # 错开启动时间 + # + # # 启动生产者线程 + # producer_thread = threading.Thread(target=producer_task, name="Producer") + # producer_thread.start() + # + # # 等待所有线程完成 + 
# producer_thread.join(timeout=10) + # for thread in consumer_threads: + # thread.join(timeout=10) + # + # # 验证结果 + # assert len(consumer_results) == 3, "应该有3个消费者结果" + # assert len(producer_result) == 3, "应该生产3批数据" + # + # # 验证所有消费者都成功 + # for result in consumer_results: + # assert result['success'], f"消费者{result['id']}失败: {result.get('error', '')}" + # assert result['samples_count'] == 1, f"消费者{result['id']}应该获取1个样本" + # assert result['wait_time'] >= 1.5, f"消费者{result['id']}等待时间应该≥1.5秒" + # + # # 验证生产者都成功 + # assert all(producer_result), "所有生产操作都应该成功" + # + # # 验证最终状态 + # final_stats = message_queue_client.get_statistics() + # assert final_stats['total_produced'] == 3, "应该总共生产3个样本" + # assert final_stats['total_consumed'] == 3, "应该总共消费3个样本" + # assert final_stats['queue_size'] == 0, "队列应该被清空" + + def test_consumer_timeout_scenario(self, message_queue_client, mock_data_proto): + """测试消费者超时场景(通过关闭队列来模拟)""" + consumer_result = [] + + def consumer_task(): + """消费者任务:等待样本""" + try: + start_time = time.time() + # 尝试获取样本,但没有生产者会生产数据 + samples = message_queue_client.get_samples(min_batch_count=2) + end_time = time.time() + + consumer_result.append( + {"success": True, "samples_count": len(samples), "wait_time": end_time - start_time} + ) + except Exception as e: + consumer_result.append({"success": False, "error": str(e)}) + + def shutdown_task(): + """延迟关闭队列,模拟超时场景""" + time.sleep(2.0) # 让消费者等待2秒 + message_queue_client.shutdown() + + # 启动消费者和关闭任务 + consumer_thread = threading.Thread(target=consumer_task, name="Consumer") + shutdown_thread = threading.Thread(target=shutdown_task, name="Shutdown") + + consumer_thread.start() + time.sleep(0.1) + shutdown_thread.start() + + # 等待线程完成 + shutdown_thread.join(timeout=5) + consumer_thread.join(timeout=5) + + # 验证结果 + assert len(consumer_result) == 1, "应该有一个消费者结果" + + result = consumer_result[0] + # 消费者应该在队列关闭后返回空列表 + if result["success"]: + assert result["samples_count"] == 0, "关闭后应该返回空样本列表" + + print(f"消费者等待了 
{result.get('wait_time', 0):.2f} 秒后退出") + + # 运行测试的示例配置 + -# 运行测试的示例配置 if __name__ == "__main__": pytest.main([__file__, "-v", "--tb=short"]) diff --git a/recipe/fully_async_policy/unittest/test_mq2.py b/recipe/fully_async_policy/unittest/test_mq2.py new file mode 100644 index 00000000000..d846a16dcb7 --- /dev/null +++ b/recipe/fully_async_policy/unittest/test_mq2.py @@ -0,0 +1,171 @@ +# Copyright 2025 Meituan Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import threading +import time +from unittest.mock import Mock + +import pytest +import ray +from omegaconf import DictConfig + +from recipe.fully_async_policy.message_queue import MessageQueue, MessageQueueClient, QueueSample + + +@pytest.fixture +def mock_data_proto(): + """Mock数据对象""" + return Mock() + + +@pytest.fixture +def basic_config(): + """基础配置""" + return DictConfig({"async_training": {"staleness_threshold": 3}}) + + +@pytest.fixture +def queue_config(): + """队列配置""" + return DictConfig({"async_training": {"staleness_threshold": 2}}) + + +@pytest.fixture +def ray_setup(): + """设置Ray环境""" + if not ray.is_initialized(): + ray.init(local_mode=True, ignore_reinit_error=True) + yield + ray.shutdown() + + +@pytest.fixture +def message_queue_client(ray_setup, basic_config): + """创建MessageQueue actor并返回其客户端""" + actor = MessageQueue.remote(basic_config, max_queue_size=10) + client = MessageQueueClient(actor) + yield client + client.shutdown() + + +class TestConcurrency: + """测试并发场景""" + 
+ def setup_method(self): + """每个测试方法前的设置""" + if not ray.is_initialized(): + ray.init() + + def teardown_method(self): + """每个测试方法后的清理""" + if ray.is_initialized(): + ray.shutdown() + + def create_message_queue_client(self, config=None): + """创建MessageQueue client的辅助方法""" + if config is None: + config = DictConfig({"async_training": {"staleness_threshold": 3}}) + actor = MessageQueue.remote(config, max_queue_size=10) + return MessageQueueClient(actor) + + def test_consume_first_produce_later(self, message_queue_client, mock_data_proto): + """测试先消费、后生产的场景 - 验证阻塞和唤醒机制""" + consumer_result = [] + producer_result = [] + start_time = time.time() + + def consumer_task(): + """消费者任务:先启动,等待生产者生产数据""" + # 记录开始消费的时间 + consumer_start = time.time() + # 这里会阻塞等待,直到有至少2个样本可用 + samples = message_queue_client.get_samples(min_batch_count=3) + consumer_end = time.time() + consumer_result.append( + { + "success": True, + "samples_count": len(samples), + "wait_time": consumer_end - consumer_start, + "samples": samples, + } + ) + + def producer_task(): + """生产者任务:延迟1秒后开始生产""" + time.sleep(4.0) + producer_start = time.time() + message_queue_client.put_sample( + sample=mock_data_proto, + param_version=1, + ) + time.sleep(1) + message_queue_client.put_sample( + sample=mock_data_proto, + param_version=1, + ) + time.sleep(1) + message_queue_client.put_sample( + sample=mock_data_proto, + param_version=1, + ) + producer_end = time.time() + producer_result.append( + { + "put_count": 3, + "produce_time": producer_end - producer_start, + } + ) + + print("produce finish") + + # 启动消费者线程(先启动) + consumer_thread = threading.Thread(target=consumer_task, name="Consumer") + time.sleep(3) + # 启动生产者线程(后启动) + producer_thread = threading.Thread(target=producer_task, name="Producer") + + consumer_thread.start() + time.sleep(0.1) # 确保消费者先开始等待 + producer_thread.start() + + print("=========", flush=True) + # + # # 等待两个线程完成(设置超时避免死锁) + producer_thread.join() + print("producer_result", producer_result, 
flush=True) + consumer_thread.join() + print("consumer_result", consumer_result, flush=True) + + # 验证结果 + assert len(consumer_result) == 1, "消费者应该执行一次" + + consumer_data = consumer_result[0] + producer_data = producer_result[0] + + # 验证生产者成功 + assert producer_data["put_count"] == 3, "应该生产2批数据" + assert consumer_data["samples_count"] == 3, "消费者应该获取到2个样本" + + # 验证队列状态 + final_queue_size = message_queue_client.get_queue_size() + assert final_queue_size == 0, "队列应该被清空" + + stats = message_queue_client.get_statistics() + assert stats["total_produced"] == 3, "应该生产了2个样本" + assert stats["total_consumed"] == 3, "应该消费了2个样本" + # + + +# 运行测试的示例配置 +if __name__ == "__main__": + pytest.main([__file__, "-v", "--tb=short"]) diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh index 2949316228a..50eb9070314 100644 --- a/tests/special_e2e/run_fully_async_policy.sh +++ b/tests/special_e2e/run_fully_async_policy.sh @@ -33,10 +33,10 @@ overlong_penalty_factor=1.0 # Training parameters loss_agg_mode="token-mean" -train_prompt_bsz=32 -gen_prompt_bsz=4 +train_prompt_bsz=2 +gen_prompt_bsz=2 n_resp_per_prompt=3 -train_prompt_mini_bsz=4 +train_prompt_mini_bsz=1 # Temperature parameters temperature=1.0 diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py index 26150cc631d..9b87d5a3bd8 100644 --- a/verl/trainer/ppo/ray_trainer.py +++ b/verl/trainer/ppo/ray_trainer.py @@ -1237,6 +1237,7 @@ def _post_generate_batch(self, batch, gen_batch_output, metrics): def _process_batch_common(self, batch, metrics, timing_raw): with marked_timer("reward", timing_raw, color="yellow"): # compute reward model score + print("marked_timer reward") if self.use_rm: reward_tensor = self.rm_wg.compute_rm_score(batch) batch = batch.union(reward_tensor) @@ -1247,6 +1248,8 @@ def _process_batch_common(self, batch, metrics, timing_raw): reward_tensor, reward_extra_infos_dict = compute_reward(batch, self.reward_fn) # recompute old_log_probs with 
marked_timer("old_log_prob", timing_raw, color="blue"): + print("marked_timer rewold_log_prob") + old_log_prob = self.actor_rollout_wg.compute_log_prob(batch) entropys = old_log_prob.batch["entropys"] response_masks = batch.batch["response_mask"] @@ -1281,6 +1284,8 @@ def _process_batch_common(self, batch, metrics, timing_raw): } ) if self.use_reference_policy: + print("marked_timer use_reference_policy") + # compute reference log_prob with marked_timer("ref", timing_raw, color="olive"): if not self.ref_in_actor: @@ -1290,10 +1295,12 @@ def _process_batch_common(self, batch, metrics, timing_raw): batch = batch.union(ref_log_prob) # compute values if self.use_critic: + print("marked_timer compute use_critic") with marked_timer("values", timing_raw, color="cyan"): values = self.critic_wg.compute_values(batch) batch = batch.union(values) with marked_timer("adv", timing_raw, color="brown"): + print("marked_timer adv") # we combine with rule-based rm reward_extra_infos_dict: dict[str, list] if self.config.reward_model.launch_reward_fn_async: @@ -1329,6 +1336,7 @@ def _process_batch_common(self, batch, metrics, timing_raw): ) # update critic if self.use_critic: + print("marked_timer update use_critic") with marked_timer("update_critic", timing_raw, color="pink"): critic_output = self.critic_wg.update_critic(batch) critic_output_metrics = reduce_metrics(critic_output.meta_info["metrics"]) @@ -1336,6 +1344,7 @@ def _process_batch_common(self, batch, metrics, timing_raw): # implement critic warmup if self.config.trainer.critic_warmup <= self.global_steps: # update actor + print("marked_timer update_actor") with marked_timer("update_actor", timing_raw, color="red"): batch.meta_info["multi_turn"] = self.config.actor_rollout_ref.rollout.multi_turn.enable actor_output = self.actor_rollout_wg.update_actor(batch) From a8691b0971f84db33bda89186c38ff0b7e981d63 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Wed, 6 Aug 2025 19:18:20 +0800 Subject: [PATCH 028/182] train mutil step --- 
.../fully_async_rollouter.py | 1 - .../fully_async_policy/fully_async_trainer.py | 55 ++++++++++++++----- recipe/fully_async_policy/message_queue.py | 3 + 3 files changed, 45 insertions(+), 14 deletions(-) diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 61b21b43fd5..b4ad9796294 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -370,7 +370,6 @@ def _should_pause_generation(self) -> bool: return True # 如果队列太满,也暂停生成 - if queue_size >= self.max_queue_size: print(f"Should pause due to full queue: size={queue_size}, max={self.max_queue_size}") return True diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 9830aef595e..7d7a1130340 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -308,12 +308,12 @@ def fit(self): print(data=val_metrics, step=self.global_steps) if self.config.trainer.get("val_only", False): return - + # TODO 需要从 self.total_training_steps = self.config.trainer.total_training_steps print(f"Total training steps: {self.total_training_steps}") # add tqdm - progress_bar = tqdm(total=self.total_training_steps, initial=self.global_steps, desc="Training Progress") + # progress_bar = tqdm(total=self.total_training_steps, initial=self.global_steps, desc="Training Progress") # we start from step 1 self.global_steps += 1 @@ -324,6 +324,15 @@ def fit(self): # 初始化获取第一批数据 while True: print("while True", flush=True) + + # 检查队列状态 + if self.message_queue_client: + queue_stats = self.message_queue_client.get_statistics() + print(f"Queue status before getting samples: {queue_stats}") + + if queue_stats.get('queue_size', 0) == 0: + print("WARNING: Queue is empty, will block waiting for samples") + metrics = {} timing_raw = {} @@ -383,22 +392,42 @@ def fit(self): 
self._check_save_checkpoint(is_last_step, timing_raw) print("_stop_profiling") - self._stop_profiling(do_profile, timing_raw) + # self._stop_profiling(do_profile, timing_raw) print("_collect_metrics") - self._collect_metrics(batch, epoch, metrics, timing_raw) + # self._collect_metrics(batch, epoch, metrics, timing_raw) print("_post_batch_processing") - self._post_batch_processing(batch) + # self._post_batch_processing(batch) + + print("step end") + # + # # TODO: make a canonical logger that supports various backend + # print(data=metrics, step=self.global_steps) + # + # # progress_bar.update(1) + # self.global_steps += 1 + print("is_last_step") + # if is_last_step: + # pprint(f"Final validation metrics: {last_val_metrics}") + # print("is_last_step") + # # progress_bar.close() + # return + # + # + # # 检查队列状态 + # if self.message_queue_client: + # queue_stats = self.message_queue_client.get_statistics() + # print(f"Queue status before getting samples: {queue_stats}") + # + # if queue_stats.get('queue_size', 0) == 0: + # print("WARNING: Queue is empty, will block waiting for samples") + # + # with marked_timer("gen", timing_raw, color="red"): + # epoch, batch = self._get_samples_from_queue() + # if batch is None: + # break - # TODO: make a canonical logger that supports various backend - print(data=metrics, step=self.global_steps) - progress_bar.update(1) - self.global_steps += 1 - if is_last_step: - pprint(f"Final validation metrics: {last_val_metrics}") - progress_bar.close() - return def get_statistics(self) -> dict: """获取训练统计信息""" diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py index c6116f0c432..e5c382dec2a 100644 --- a/recipe/fully_async_policy/message_queue.py +++ b/recipe/fully_async_policy/message_queue.py @@ -115,8 +115,11 @@ def get_samples(self, min_batch_count: int = 1) -> list[Any]: Returns: List[Any]: 获取的样本列表 """ + + print("get_samples") with self.lock: while len(self.queue) < min_batch_count and 
self.running: + print("consumer_condition") self.consumer_condition.wait() # 如果队列已关闭且没有足够样本,返回空列表 From ee8914ccdd25362072798815d116b984ed6f5131 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Fri, 8 Aug 2025 11:00:24 +0800 Subject: [PATCH 029/182] param_sync --- .../fully_async_policy/UNIFIED_PARAM_SYNC.md | 143 +++++++ recipe/fully_async_policy/fully_async_main.py | 156 +------ .../fully_async_rollouter.py | 5 + .../fully_async_policy/fully_async_trainer.py | 150 +++++-- recipe/fully_async_policy/param_sync.py | 392 ++---------------- 5 files changed, 330 insertions(+), 516 deletions(-) create mode 100644 recipe/fully_async_policy/UNIFIED_PARAM_SYNC.md diff --git a/recipe/fully_async_policy/UNIFIED_PARAM_SYNC.md b/recipe/fully_async_policy/UNIFIED_PARAM_SYNC.md new file mode 100644 index 00000000000..e816968f8fc --- /dev/null +++ b/recipe/fully_async_policy/UNIFIED_PARAM_SYNC.md @@ -0,0 +1,143 @@ +# 统一参数同步器使用指南 (Unified Parameter Synchronizer Guide) + +本文档说明了新的统一参数同步器 `UnifiedParameterSynchronizer` 的使用方法。该类合并了原有的多个同步器类的功能,提供了更简洁和统一的接口。 + +## 🏗️ 类合并说明 + +### 原有类结构(已合并) +- `ParameterSynchronizer` - 基础参数同步器 +- `ParameterSyncManager` - Ray Actor形式的参数同步管理器 +- `AsyncParameterSynchronizer` - 异步参数同步器 + +### 新的统一类 +- `UnifiedParameterSynchronizer` - 统一参数同步器,包含所有功能 + +## 🚀 使用方法 + +### 1. 异步训练模式(推荐) +```python +from recipe.fully_async_policy.param_sync import UnifiedParameterSynchronizer + +# 创建异步模式的参数同步器 +param_synchronizer = UnifiedParameterSynchronizer( + config=config, + trainer_actor=trainer_actor, + rollouter_actor=rollouter_actor +) + +# 同步参数到rollouter +success = param_synchronizer.sync_to_rollouter(new_version=1) +``` + +### 2. 
Ray Actor模式 +```python +from recipe.fully_async_policy.param_sync import ParameterSyncManager + +# 创建Ray remote参数同步管理器 +sync_manager = ParameterSyncManager.remote(config) + +# 注册workers +success = ray.get(sync_manager.register_workers.remote(actor_workers, rollout_workers)) + +# 执行同步 +success = ray.get(sync_manager.sync_parameters.remote()) +``` + +### 3. 传统模式 +```python +from recipe.fully_async_policy.param_sync import UnifiedParameterSynchronizer + +# 创建传统模式的参数同步器 +synchronizer = UnifiedParameterSynchronizer(config) + +# 初始化同步组 +success = synchronizer.initialize_sync_group(actor_workers, rollout_workers) + +# 同步权重 +success = synchronizer.sync_weights(actor_workers, rollout_workers) +``` + +## 🔄 向后兼容性 + +为了确保现有代码的兼容性,提供了以下别名: + +```python +# 这些别名指向 UnifiedParameterSynchronizer +ParameterSynchronizer = UnifiedParameterSynchronizer +AsyncParameterSynchronizer = UnifiedParameterSynchronizer + +# Ray remote版本 +ParameterSyncManager = ray.remote(UnifiedParameterSynchronizer) +``` + +现有代码无需修改即可使用新的统一同步器。 + +## ⚙️ 初始化参数 + +```python +def __init__(self, config, trainer_actor=None, rollouter_actor=None, as_ray_actor=False): +``` + +- `config`: 配置对象(必需) +- `trainer_actor`: trainer actor引用(用于async模式) +- `rollouter_actor`: rollouter actor引用(用于async模式) +- `as_ray_actor`: 是否作为Ray actor使用 + +## 📊 主要方法 + +### 异步模式 +- `sync_to_rollouter(new_version)`: 同步参数到rollouter +- `get_current_version()`: 获取当前参数版本 + +### Ray Actor模式 +- `register_workers(actor_workers, rollout_workers)`: 注册workers +- `sync_parameters()`: 执行参数同步 + +### 传统模式 +- `initialize_sync_group(actor_workers, rollout_workers)`: 初始化同步组 +- `sync_weights(actor_workers, rollout_workers)`: 同步权重 + +### 通用方法 +- `get_statistics()`: 获取统计信息 +- `get_weights_info()`: 获取权重信息 +- `cleanup()`: 清理资源 + +## 📈 统计信息 + +```python +stats = synchronizer.get_statistics() +# 返回: +{ + "sync_count": 15, + "sync_failures": 0, + "last_sync_time": 1640995200.0, + "sync_group_initialized": True, + "current_param_version": 15, + "current_version": 15, + 
"is_ready": True # 仅在Ray actor模式下 +} +``` + +## 🎯 优势 + +1. **统一接口**: 一个类支持所有同步模式 +2. **向后兼容**: 现有代码无需修改 +3. **灵活配置**: 支持多种初始化方式 +4. **完整功能**: 包含所有原有类的功能 +5. **简化维护**: 减少代码重复,便于维护 + +## 🔧 配置示例 + +```yaml +async_training: + max_sync_retries: 3 + sync_timeout: 30.0 + sync_retry_delay: 1.0 + sync_monitor_interval: 60.0 + staleness_threshold: 3 +``` + +--- + +*统一参数同步器简化了参数同步的使用,同时保持了所有原有功能的完整性。* + diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py index 888c6c73594..aa5ac81f48a 100644 --- a/recipe/fully_async_policy/fully_async_main.py +++ b/recipe/fully_async_policy/fully_async_main.py @@ -204,6 +204,7 @@ def _initialize_components(self, config) -> None: self.components["tokenizer"] = tokenizer self.components["processor"] = processor + self.components["config"] = config # 保存config以供其他方法使用 # 创建worker映射和资源池 print("Creating worker mapping and resource pools...") @@ -244,14 +245,22 @@ def _initialize_components(self, config) -> None: self._create_trainer(config) # 设置参数同步 - # print("Setting up parameter synchronization...") - # param_synchronizer = AsyncParameterSynchronizer( - # config=config, - # actor_wg=self.components["trainer"].actor_wg, - # rollouter=self.components["rollouter"], - # ) - # self.components["param_synchronizer"] = param_synchronizer - # print("All components initialized successfully") + print("Setting up parameter synchronization...") + from recipe.fully_async_policy.param_sync import ParameterSynchronizer + + param_synchronizer = ParameterSynchronizer( + config=config, + actor_wg=self.components["trainer"], + rollout_wg=self.components["rollouter"], + ) + + # 将参数同步器设置到trainer和rollouter + ray.get(self.components["trainer"].set_parameter_synchronizer.remote(param_synchronizer)) + ray.get(self.components["rollouter"].set_parameter_synchronizer.remote(param_synchronizer)) + + self.components["param_synchronizer"] = param_synchronizer + print("Parameter synchronizer initialized successfully") + 
print("All components initialized successfully") def _create_rollouter(self, config) -> None: """创建Rollouter""" @@ -312,7 +321,6 @@ def _run_training_loop(self): print("Starting Rollouter in background...") rollouter_future = self.components["rollouter"].fit.remote() trainer_future = self.components["trainer"].fit.remote() - # self._monitor_components() print("Starting Trainer...") time.sleep(10) @@ -324,136 +332,6 @@ def _run_training_loop(self): print("Training completed or interrupted") - def _monitor_components(self): - """监控组件状态""" - print("Starting component monitoring...") - - last_stats_time = time.time() - stats_interval = 60.0 # 60秒报告一次统计 - - while self.running and not self.shutdown_event.is_set(): - try: - # 等待一段时间或直到收到停止信号 - if self.shutdown_event.wait(timeout=10.0): - break - - # 定期报告统计信息 - current_time = time.time() - if current_time - last_stats_time >= stats_interval: - self._log_component_statistics() - last_stats_time = current_time - - # 检查组件健康状态 - self._check_component_health() - - except Exception as e: - print(f"Error in component monitoring: {e}") - - print("Component monitoring stopped") - - def _log_component_statistics(self): - """记录组件统计信息""" - try: - # 获取Trainer统计 - trainer_stats = self.components["trainer"].get_statistics() - - # 获取Rollouter统计 - rollouter_stats = ray.get(self.components["rollouter"].get_statistics.remote(), timeout=5.0) - - # 获取队列统计 - queue_stats = self.components["message_queue_client"].get_statistics() - - print("=== Component Statistics ===") - print( - f"Trainer - Steps: {trainer_stats['global_steps']}, " - f"Samples: {trainer_stats['processed_samples']}, " - f"Param version: {trainer_stats['current_param_version']}" - ) - - print( - f"Rollouter - Generated: {rollouter_stats['total_generated_samples']}, " - f"Dropped: {rollouter_stats['dropped_stale_samples']}, " - f"Errors: {rollouter_stats['generation_errors']}" - ) - - print( - f"Queue - Size: {queue_stats['queue_size']}, " - f"Produced: 
{queue_stats['total_produced']}, " - f"Consumed: {queue_stats['total_consumed']}" - ) - - except Exception as e: - print(f"Error getting component statistics: {e}") - - def _check_component_health(self): - """检查组件健康状态""" - try: - # 检查trainer是否仍在运行 - if hasattr(self.components["trainer"], "global_steps"): - current_steps = self.components["trainer"].global_steps - # 可以添加更多健康检查逻辑 - print(current_steps) - - # 检查rollouter是否仍在运行 - rollouter_stats = ray.get(self.components["rollouter"].get_statistics.remote(), timeout=5.0) - - if not rollouter_stats["is_running"]: - print("Rollouter is not running!") - # 可以尝试重启或报告错误 - - except Exception as e: - print(f"Health check failed: {e}") - - def _cleanup_resources(self): - """清理资源""" - print("Cleaning up resources...") - - try: - # 停止Rollouter - if "rollouter" in self.components: - print("Shutting down Rollouter...") - try: - shutdown_future = self.components["rollouter"].shutdown.remote() - ray.get(shutdown_future, timeout=10.0) - except Exception as e: - print(f"Error shutting down Rollouter: {e}") - - # 清理MessageQueue - if "message_queue_client" in self.components: - print("Cleaning up MessageQueue...") - try: - self.components["message_queue_client"].shutdown() - except Exception as e: - print(f"Error cleaning up MessageQueue: {e}") - - # 清理参数同步器 - if "param_synchronizer" in self.components: - print("Cleaning up parameter synchronizer...") - # TODO: 添加参数同步器的清理逻辑 - - print("Resource cleanup completed") - - except Exception as e: - print(f"Error during cleanup: {e}") - - def get_training_status(self) -> dict: - """获取训练状态""" - if not self.running or "trainer" not in self.components: - return {"status": "not_running"} - - try: - trainer_stats = self.components["trainer"].get_statistics() - rollouter_stats = ray.get(self.components["rollouter"].get_statistics.remote(), timeout=5.0) - - return { - "status": "running", - "trainer_stats": trainer_stats, - "rollouter_stats": rollouter_stats, - } - except Exception as e: - 
print(f"Error getting training status: {e}") - return {"status": "error", "error": str(e)} - @hydra.main(config_path="config", config_name="fully_async_ppo_trainer", version_base=None) def main(config): diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index b4ad9796294..c760215c580 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -138,6 +138,11 @@ def __init__( self.sync_in_progress = False self.sync_lock = threading.Lock() + # 参数同步状态 - 基于one_step_off_policy模式 + self._weights_info = None + self._is_rollout = True # rollouter是rollout角色 + self._is_actor = False + self.max_queue_size = max_queue_size def set_message_queue_client(self, message_queue_client: MessageQueueClient): diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 7d7a1130340..20bae1a6a64 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -22,7 +22,6 @@ import numpy as np import ray from omegaconf import OmegaConf -from tqdm import tqdm from recipe.fully_async_policy.message_queue import MessageQueueClient, QueueSample from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup @@ -127,6 +126,11 @@ def __init__( self.current_param_version = 0 self.param_sync_count = 0 + # 参数同步相关状态 + self._weights_info = None + self._is_actor = False # 将在init_worker_group中设置 + self._is_rollout = False + def set_message_queue_client(self, message_queue_client: MessageQueueClient): """设置消息队列客户端""" with self.lock: @@ -137,6 +141,60 @@ def set_parameter_synchronizer(self, param_synchronizer): with self.lock: self.param_synchronizer = param_synchronizer + def _get_actor_params(self): + """ + 获取actor参数 - 基于one_step_off_policy的实现 + """ + if not hasattr(self, "actor_wg") or self.actor_wg is None: + raise ValueError("Actor worker group not 
initialized") + + # 从actor worker group获取参数 + actor_workers = self.actor_wg.workers + if not actor_workers: + raise ValueError("No actor workers available") + + # 获取第一个actor worker的参数信息 + params_future = actor_workers[0]._get_actor_params.remote() + params = ray.get(params_future, timeout=10.0) + return params + + def get_actor_weights_info(self): + """ + 获取actor权重信息 - 基于one_step_off_policy的模式 + """ + if hasattr(self, "_weights_info") and self._weights_info is not None: + return self._weights_info + + if not hasattr(self, "actor_wg") or self.actor_wg is None: + raise ValueError("Actor worker group not initialized") + + # 从actor worker group获取权重信息 + weights_info_future = self.actor_wg.get_actor_weights_info.remote() + weights_info = ray.get(weights_info_future, timeout=10.0) + + # 缓存权重信息 + self._weights_info = weights_info[0] if isinstance(weights_info, list) else weights_info + return self._weights_info + + def sync_rollout_weights(self): + """ + 同步rollout权重 - Actor端的同步操作 + """ + if not hasattr(self, "actor_wg") or self.actor_wg is None: + logger.warning("Actor worker group not initialized for sync") + return False + + try: + # 触发actor worker group的参数同步 + sync_future = self.actor_wg.sync_rollout_weights.remote() + ray.get(sync_future, timeout=30.0) + logger.debug("Actor weights sync completed") + return True + + except Exception as e: + logger.error(f"Failed to sync actor weights: {e}") + return False + def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]: """ 从消息队列获取样本并组成gen_batch_output @@ -287,7 +345,7 @@ def fit(self): from verl.utils.tracking import Tracking - logger = Tracking( + self.logger = Tracking( project_name=self.config.trainer.project_name, experiment_name=self.config.trainer.experiment_name, default_backend=self.config.trainer.logger, @@ -330,7 +388,7 @@ def fit(self): queue_stats = self.message_queue_client.get_statistics() print(f"Queue status before getting samples: {queue_stats}") - if queue_stats.get('queue_size', 0) == 0: + 
if queue_stats.get("queue_size", 0) == 0: print("WARNING: Queue is empty, will block waiting for samples") metrics = {} @@ -399,35 +457,22 @@ def fit(self): # self._post_batch_processing(batch) print("step end") - # - # # TODO: make a canonical logger that supports various backend - # print(data=metrics, step=self.global_steps) - # - # # progress_bar.update(1) - # self.global_steps += 1 + + # 在训练步骤结束后触发参数同步 + self._trigger_parameter_sync_after_step() + + # TODO: make a canonical logger that supports various backend + print(data=metrics, step=self.global_steps) + + # progress_bar.update(1) + self.global_steps += 1 print("is_last_step") # if is_last_step: # pprint(f"Final validation metrics: {last_val_metrics}") # print("is_last_step") # # progress_bar.close() # return - # - # - # # 检查队列状态 - # if self.message_queue_client: - # queue_stats = self.message_queue_client.get_statistics() - # print(f"Queue status before getting samples: {queue_stats}") - # - # if queue_stats.get('queue_size', 0) == 0: - # print("WARNING: Queue is empty, will block waiting for samples") - # - # with marked_timer("gen", timing_raw, color="red"): - # epoch, batch = self._get_samples_from_queue() - # if batch is None: - # break - - - + ray.get(self.param_synchronizer.sync_weights.remote(self.global_steps)) def get_statistics(self) -> dict: """获取训练统计信息""" @@ -444,6 +489,59 @@ def get_statistics(self) -> dict: "queue_dropped_samples": queue_stats.get("dropped_samples", 0), } + def _trigger_parameter_sync_after_step(self): + """ + 在训练步骤结束后触发参数同步 + 这确保rollouter总是使用最新训练的参数 + """ + if not self.param_synchronizer: + logger.debug("No parameter synchronizer available, skipping sync") + return + + try: + # 更新参数版本号 + new_version = self.current_param_version + 1 + + print( + f"[TRAINER] Triggering parameter sync after training step {self.global_steps}, version: {new_version}" + ) + logger.info(f"Triggering parameter sync after training step {self.global_steps}, version: {new_version}") + + # 
异步触发参数同步,不阻塞训练流程 + import threading + + sync_thread = threading.Thread(target=self._async_parameter_sync, args=(new_version,), daemon=True) + sync_thread.start() + + except Exception as e: + logger.error(f"Error triggering parameter sync: {e}") + + def _async_parameter_sync(self, new_version: int): + """ + 异步执行参数同步,避免阻塞训练流程 + + Args: + new_version: 新的参数版本号 + """ + try: + # 执行参数同步 + success = self.param_synchronizer.sync_to_rollouter(new_version) + + if success: + # 更新本地参数版本 + with self.lock: + self.current_param_version = new_version + self.param_sync_count += 1 + + print(f"[TRAINER] Parameter sync completed successfully for version {new_version}") + logger.info(f"Parameter sync completed successfully for version {new_version}") + else: + print(f"[TRAINER] Parameter sync failed for version {new_version}") + logger.warning(f"Parameter sync failed for version {new_version}") + + except Exception as e: + logger.error(f"Error in async parameter sync: {e}") + def update_param_version(self, param_version: int) -> bool: """ 更新trainer的参数版本,用于跟踪与rollouter的参数同步状态 diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py index 023475ef777..10843302786 100644 --- a/recipe/fully_async_policy/param_sync.py +++ b/recipe/fully_async_policy/param_sync.py @@ -13,7 +13,6 @@ # limitations under the License. 
import logging -import time import ray from ray.util.collective import collective @@ -21,372 +20,63 @@ logger = logging.getLogger(__name__) -class ParameterSynchronizer: - """ - 参数同步器,负责在actor和rollout之间同步模型参数 - 改进版本,具有更好的错误处理和重试机制 - """ - - def __init__(self, config): - self.config = config - self.weights_info = None - self.sync_group_initialized = False - self.sync_group_name = "actor_rollout" - - # 同步配置 - self.max_sync_retries = config.async_training.get("max_sync_retries", 3) - self.sync_timeout = config.async_training.get("sync_timeout", 30.0) - self.retry_delay = config.async_training.get("sync_retry_delay", 1.0) - - # 统计信息 - self.sync_count = 0 - self.sync_failures = 0 - self.last_sync_time = 0 - - def initialize_sync_group(self, actor_workers: list, rollout_workers: list) -> bool: - """ - 初始化参数同步组 - - Args: - actor_workers: actor worker列表 - rollout_workers: rollout worker列表 - - Returns: - bool: 是否成功初始化 - """ - logger.info("Initializing parameter synchronization group...") - - try: - # 验证workers - if not actor_workers: - raise ValueError("No actor workers provided") - if not rollout_workers: - raise ValueError("No rollout workers provided") - - # 获取actor的权重信息 - logger.debug("Getting actor weights info...") - weights_info_future = actor_workers[0].get_actor_weights_info.remote() - self.weights_info = ray.get(weights_info_future, timeout=10.0)[0] - - if not self.weights_info: - raise ValueError("Failed to get actor weights info") - - # 设置rollout的权重信息 - logger.debug("Setting rollout weights info...") - set_weights_futures = [] - for rollout_worker in rollout_workers: - future = rollout_worker.set_actor_weights_info.remote(self.weights_info) - set_weights_futures.append(future) - - ray.get(set_weights_futures, timeout=10.0) - - # 创建actor-rollout通信组 - logger.debug("Creating collective communication group...") - all_workers = actor_workers + rollout_workers - - # 清理可能存在的旧组 - try: - collective.destroy_collective_group(self.sync_group_name) - except Exception: - pass 
# 忽略清理错误 - - collective.create_collective_group( - all_workers, - len(all_workers), - list(range(0, len(all_workers))), - backend="nccl", - group_name=self.sync_group_name, - ) - - self.sync_group_initialized = True - logger.info("Parameter synchronization group initialized successfully") - return True - - except Exception as e: - logger.error(f"Failed to initialize sync group: {e}") - self.sync_group_initialized = False - return False - - def sync_weights(self, actor_workers: list, rollout_workers: list) -> bool: - """ - 同步权重从actor到rollout - 改进版本,具有重试和错误处理 - - Args: - actor_workers: actor worker列表 - rollout_workers: rollout worker列表 - - Returns: - bool: 是否同步成功 - """ - if not self.sync_group_initialized: - logger.error("Sync group not initialized. Call initialize_sync_group() first.") - return False - - logger.debug("Starting weight synchronization...") - start_time = time.time() - - for attempt in range(self.max_sync_retries): - try: - # 执行同步 - success = self._execute_sync(actor_workers, rollout_workers) - - if success: - self.sync_count += 1 - self.last_sync_time = time.time() - sync_duration = self.last_sync_time - start_time - logger.debug(f"Weight synchronization completed in {sync_duration:.2f}s") - return True - else: - logger.warning(f"Sync attempt {attempt + 1} failed") - - except Exception as e: - logger.warning(f"Sync attempt {attempt + 1} failed with error: {e}") - - # 如果不是最后一次尝试,等待后重试 - if attempt < self.max_sync_retries - 1: - logger.info(f"Retrying sync in {self.retry_delay}s...") - time.sleep(self.retry_delay) - - # 所有重试都失败 - self.sync_failures += 1 - logger.error(f"All sync attempts failed. 
Total failures: {self.sync_failures}") - return False - - def _execute_sync(self, actor_workers: list, rollout_workers: list) -> bool: - """ - 执行实际的同步操作 - - Args: - actor_workers: actor worker列表 - rollout_workers: rollout worker列表 - - Returns: - bool: 是否同步成功 - """ - try: - sync_futures = [] - - # Actor端同步 - for actor_worker in actor_workers: - future = actor_worker.sync_rollout_weights.remote() - sync_futures.append(future) - - # Rollout端同步 - for rollout_worker in rollout_workers: - future = rollout_worker.sync_rollout_weights.remote() - sync_futures.append(future) - - # 等待所有同步完成,带超时 - ray.get(sync_futures, timeout=self.sync_timeout) - return True - - except Exception as e: - logger.error(f"Sync execution failed: {e}") - return False - - def cleanup(self): - """清理同步组""" - if self.sync_group_initialized: - try: - collective.destroy_collective_group(self.sync_group_name) - logger.info("Sync group cleaned up") - except Exception as e: - logger.warning(f"Error cleaning up sync group: {e}") - finally: - self.sync_group_initialized = False - - def get_statistics(self) -> dict: - """获取同步统计信息""" - return { - "sync_count": self.sync_count, - "sync_failures": self.sync_failures, - "last_sync_time": self.last_sync_time, - "sync_group_initialized": self.sync_group_initialized, - } - - @ray.remote -class ParameterSyncManager: +class ParameterSynchronizer: """ - Ray Actor形式的参数同步管理器 - 改进版本 + 统一的参数同步器,负责在actor和rollout之间同步模型参数 + 基于one_step_off_policy的成熟同步模式实现 + 合并了原有的多个同步器类的功能 """ - def __init__(self, config): - self.config = config - self.synchronizer = ParameterSynchronizer(config) - self.actor_workers = [] - self.rollout_workers = [] - self.is_ready = False - - def register_workers(self, actor_workers: list, rollout_workers: list) -> bool: - """ - 注册worker - - Args: - actor_workers: actor worker列表 - rollout_workers: rollout worker列表 - - Returns: - bool: 是否成功注册 - """ - try: - self.actor_workers = actor_workers - self.rollout_workers = rollout_workers - - # 初始化同步组 - success = 
self.synchronizer.initialize_sync_group(actor_workers, rollout_workers) - self.is_ready = success - - if success: - logger.info("ParameterSyncManager ready") - else: - logger.error("ParameterSyncManager initialization failed") - - return success - except Exception as e: - logger.error(f"Failed to register workers: {e}") - return False - - def sync_parameters(self) -> bool: - """ - 执行参数同步 - - Returns: - bool: 是否同步成功 + def __init__(self, config, actor_wg, rollout_wg): """ - if not self.is_ready: - logger.error("SyncManager not ready. Call register_workers() first.") - return False - - return self.synchronizer.sync_weights(self.actor_workers, self.rollout_workers) - - def get_weights_info(self): - """获取权重信息""" - return self.synchronizer.weights_info - - def get_statistics(self) -> dict: - """获取统计信息""" - stats = self.synchronizer.get_statistics() - stats["is_ready"] = self.is_ready - return stats - - def cleanup(self): - """清理资源""" - self.synchronizer.cleanup() - self.is_ready = False + 初始化统一参数同步器 - -class AsyncParameterSynchronizer: - """ - 异步参数同步器,用于完全异步训练工作流 - 改进版本 - """ - - def __init__(self, config, actor_wg, rollouter_actor): - """ Args: - config: 配置 - actor_wg: actor worker group - rollouter_actor: rollouter actor引用 + config: 配置对象 + actor_wg: trainer actor引用(用于async模式) + rollout_wg: rollouter actor引用(用于async模式) """ self.config = config self.actor_wg = actor_wg - self.rollouter_actor = rollouter_actor - self.current_version = 0 + self.rollout_wg = rollout_wg - # 同步配置 - self.sync_timeout = config.async_training.get("sync_timeout", 30.0) - self.max_sync_retries = config.async_training.get("max_sync_retries", 3) - self.retry_delay = config.async_training.get("sync_retry_delay", 1.0) + # 基础属性 + self.weights_info = None + self.sync_group_initialized = False + self.sync_group_name = "actor_rollout" # 统计信息 - self.sync_count = 0 - self.sync_failures = 0 - self.last_sync_time = 0 + self.current_version = 0 - # 初始化同步组 + self._init_weights_info() self._init_sync_group() - 
def _init_sync_group(self): - """初始化同步组""" - try: - # 获取actor权重信息 - weights_info = self.actor_wg.get_actor_weights_info()[0] - - # 通知rollouter设置权重信息 - ray.get(self.rollouter_actor.set_weights_info.remote(weights_info), timeout=10.0) - - # 创建同步通信组 - actor_workers = self.actor_wg.workers - rollout_workers = ray.get(self.rollouter_actor.get_rollout_workers.remote(), timeout=10.0) - - all_workers = actor_workers + rollout_workers - collective.create_collective_group( - all_workers, - len(all_workers), - list(range(0, len(all_workers))), - backend="nccl", - group_name="async_actor_rollout", - ) - - logger.info("Async parameter synchronizer initialized") - - except Exception as e: - logger.warning(f"Failed to initialize async sync group: {e}") - - def sync_to_rollouter(self, new_version: int) -> bool: - """ - 将actor参数同步到rollouter - 改进版本,具有重试机制 - - Args: - new_version: 新的参数版本号 - - Returns: - bool: 是否同步成功 - """ - logger.info(f"Syncing parameters to rollouter, version: {new_version}") - start_time = time.time() - - for attempt in range(self.max_sync_retries): - try: - # 首先同步actor到rollout worker group - self.actor_wg.sync_rollout_weights() - - # 然后通知rollouter更新参数版本 - sync_future = self.rollouter_actor.update_rollout_weights.remote(new_version) - sync_result = ray.get(sync_future, timeout=self.sync_timeout) - - if sync_result: - self.current_version = new_version - self.sync_count += 1 - self.last_sync_time = time.time() - sync_duration = self.last_sync_time - start_time - logger.info(f"Parameter sync completed in {sync_duration:.2f}s, version: {new_version}") - return True - else: - logger.warning(f"Rollouter rejected sync for version {new_version}") - - except Exception as e: - logger.warning(f"Sync attempt {attempt + 1} failed: {e}") - - # 如果不是最后一次尝试,等待后重试 - if attempt < self.max_sync_retries - 1: - logger.info(f"Retrying sync in {self.retry_delay}s...") - time.sleep(self.retry_delay) + def get_current_param_version(self) -> int: + """获取当前参数版本号""" + return 
self.current_version - # 所有重试都失败 - self.sync_failures += 1 - logger.error(f"Failed to sync parameters to rollouter after {self.max_sync_retries} attempts") - return False + def get_weights_info(self): + """获取权重信息""" + return self.weights_info - def get_current_version(self) -> int: - """获取当前参数版本""" - return self.current_version + def _init_weights_info(self): + self.weights_info = self.actor_wg.get_actor_weights_info()[0] + self.rollout_wg.set_actor_weights_info(self.weights_info) - def get_statistics(self) -> dict: - """获取统计信息""" - return { - "current_version": self.current_version, - "sync_count": self.sync_count, - "sync_failures": self.sync_failures, - "last_sync_time": self.last_sync_time, - } + def _init_sync_group(self): + print("Initializing parameter synchronization group...") + actor_rollout_workers = self.actor_wg.workers + self.rollout_wg.workers + collective.create_collective_group( + actor_rollout_workers, + len(actor_rollout_workers), + list(range(0, len(actor_rollout_workers))), + backend="nccl", + group_name=self.sync_group_name, + ) + + def sync_weights(self, version): + self.current_version = version + logger.debug(f"Starting weight synchronization (version {self.current_version})...") + self.actor_wg.sync_rollout_weights() + ray.get(self.rollout_wg.sync_rollout_weights()) From 75fe2af1a35313ccb2eaf5d2d9544d8b3c0e3c0b Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Fri, 8 Aug 2025 11:45:48 +0800 Subject: [PATCH 030/182] ParameterSynchronizer --- .../README_async_trainer.md | 92 ----- recipe/fully_async_policy/TEST_GUIDE.md | 313 ------------------ .../fully_async_policy/UNIFIED_PARAM_SYNC.md | 143 -------- recipe/fully_async_policy/fully_async_main.py | 66 +++- .../fully_async_rollouter.py | 4 + .../fully_async_policy/fully_async_trainer.py | 190 ++--------- recipe/fully_async_policy/param_sync.py | 21 +- recipe/fully_async_policy/run_benchmark.sh | 307 ----------------- .../run_fully_async_example.sh | 147 -------- .../{ => 
unittest}/test_components_pytest.py | 0 10 files changed, 100 insertions(+), 1183 deletions(-) delete mode 100644 recipe/fully_async_policy/README_async_trainer.md delete mode 100644 recipe/fully_async_policy/TEST_GUIDE.md delete mode 100644 recipe/fully_async_policy/UNIFIED_PARAM_SYNC.md delete mode 100755 recipe/fully_async_policy/run_benchmark.sh delete mode 100644 recipe/fully_async_policy/run_fully_async_example.sh rename recipe/fully_async_policy/{ => unittest}/test_components_pytest.py (100%) diff --git a/recipe/fully_async_policy/README_async_trainer.md b/recipe/fully_async_policy/README_async_trainer.md deleted file mode 100644 index 9fbaa336be6..00000000000 --- a/recipe/fully_async_policy/README_async_trainer.md +++ /dev/null @@ -1,92 +0,0 @@ -# FullyAsyncTrainer 队列数据获取实现 - -## 概述 - -本实现为 `FullyAsyncTrainer` 类添加了从消息队列获取样本并组成 `gen_batch_output` 的功能,实现了完全异步的训练流程。 - -## 核心功能 - -### 1. 样本计算逻辑 - -```python -# 计算需要获取的样本数量 -n_responses_per_prompt = self.config.actor_rollout_ref.rollout.n -batch_size = self.config.data.train_batch_size -required_samples = n_responses_per_prompt * batch_size -``` - -训练器会根据配置自动计算需要从队列获取的样本数量: -- `rollout.n`: 每个prompt生成的响应数量 -- `train_batch_size`: 训练批次大小 -- 总样本数 = n × batch_size - -### 2. 主要方法 - -#### `_get_samples_from_queue()` -- 从消息队列获取指定数量的样本 -- 组装成 `gen_batch_output` 格式 -- 提取原始batch信息构造 `batch_dict` - -#### `_assemble_gen_batch_output_from_queue_samples()` -- 将队列中的多个样本重新组装成 `DataProto` 对象 -- 处理tensor和non-tensor数据 -- 合并timing信息和metadata - -#### `_extract_batch_dict_from_sample()` -- 从样本数据中提取原始输入信息 -- 过滤掉生成的输出,保留prompt相关数据 - -#### `_async_get_next_batch_from_queue()` -- 异步获取下一批队列数据 -- 使用线程池实现非阻塞操作 - -### 3. 数据流程 - -1. **样本生成**: Rollouter生成样本并放入MessageQueue -2. **样本获取**: Trainer从队列异步获取 `n × batch_size` 个样本 -3. **数据重组**: 将队列样本重新组装成标准的 `gen_batch_output` 格式 -4. **训练处理**: 样本进入标准的PPO训练流程 - -### 4. 
使用示例 - -```python -# 初始化trainer -trainer = FullyAsyncTrainer(config, tokenizer, role_worker_mapping, resource_pool_manager) - -# 设置消息队列客户端 -trainer.set_message_queue_client(message_queue_client) - -# 开始训练(自动从队列获取数据) -trainer.fit() -``` - -## 配置要求 - -确保配置中包含以下参数: - -```yaml -data: - train_batch_size: 128 # 训练批次大小 - -actor_rollout_ref: - rollout: - n: 4 # 每个prompt的响应数量 -``` - -## 特性 - -- **异步处理**: 使用异步方式从队列获取数据,不阻塞训练流程 -- **数据完整性**: 保持原有的tensor和non-tensor数据结构 -- **元数据保留**: 保留timing、参数版本等重要信息 -- **兼容性**: 与现有的PPO训练流程完全兼容 - -## 监控指标 - -训练器提供以下统计指标: -- `queue_sample_count`: 当前批次的样本数量 -- `rollout_param_versions`: 样本对应的参数版本 -- `sample_timestamps`: 样本生成时间戳 -- timing信息的平均值 - -通过 `trainer.get_statistics()` 可以获取详细的训练统计信息。 - diff --git a/recipe/fully_async_policy/TEST_GUIDE.md b/recipe/fully_async_policy/TEST_GUIDE.md deleted file mode 100644 index 3933998cd84..00000000000 --- a/recipe/fully_async_policy/TEST_GUIDE.md +++ /dev/null @@ -1,313 +0,0 @@ -# Fully Async Policy 测试指南 - -本文档介绍如何测试完全异步PPO训练系统的各种功能和性能。 - -## 📋 测试概览 - -我们提供了多种类型的测试,涵盖从单元测试到端到端测试的完整测试套件: - -### 测试类型 -1. **单元测试** - 测试各个组件的独立功能 -2. **集成测试** - 测试组件间的协作 -3. **端到端测试** - 测试完整的训练流程 -4. **性能基准测试** - 评估系统性能特征 -5. **压力测试** - 测试系统在极限条件下的表现 - -## 🚀 快速开始 - -### 1. 端到端测试 -最简单的方式是运行端到端测试,验证系统基本功能: - -```bash -# 基本E2E测试 -./run_e2e_test.sh - -# 使用环境变量自定义配置 -NUM_GPUS=4 MODEL_ID=Qwen/Qwen2.5-0.5B-Instruct ./run_e2e_test.sh -``` - -### 2. 单元测试 -运行组件级别的单元测试: - -```bash -# 运行所有单元测试 -cd unittest/ -python test_fully_async_components.py - -# 或者使用pytest(如果安装) -pytest test_components_pytest.py -v -``` - -### 3. 
性能基准测试 -评估系统性能特征: - -```bash -# 运行完整的性能基准测试 -./run_benchmark.sh - -# 自定义GPU数量和策略 -NUM_GPUS=8 ACTOR_STRATEGY=fsdp2 ./run_benchmark.sh -``` - -## 📊 测试脚本详解 - -### run_e2e_test.sh -- **目的**: 端到端功能验证 -- **配置**: 最小化配置,快速验证基本功能 -- **时长**: 约5-10分钟 -- **用法**: `./run_e2e_test.sh` - -**环境变量**: -- `NUM_GPUS`: GPU数量 (默认: 4) -- `MODEL_ID`: 使用的模型ID (默认: Qwen/Qwen2.5-0.5B-Instruct) -- `MODEL_PATH`: 模型存储路径 - -### run_benchmark.sh -- **目的**: 性能基准测试 -- **配置**: 多种配置组合,评估性能影响 -- **时长**: 约30-60分钟 -- **用法**: `./run_benchmark.sh` - -**测试覆盖**: -1. 不同新鲜度阈值的影响 -2. 不同队列大小的性能表现 -3. 生成间隔对吞吐量的影响 -4. GPU资源分配的优化 -5. 暂停/恢复功能测试 - -### test_fully_async_components.py -- **目的**: 单元和集成测试 -- **配置**: 使用Mock对象的孤立测试 -- **时长**: 约2-5分钟 -- **用法**: `python unittest/test_fully_async_components.py` - -**测试覆盖**: -- MessageQueue的基本功能 -- 参数同步器的重试机制 -- Rollouter的暂停/恢复 -- 新鲜度指标计算 -- 错误处理和超时机制 - -## 🔧 测试配置 - -### 最小化测试配置 -用于快速验证功能: - -```yaml -# 基本配置 -data: - train_batch_size: 4 - max_prompt_length: 512 - max_response_length: 1024 - -trainer: - total_training_steps: 2 - n_gpus_per_node: 2 - -rollout: - n_gpus_per_node: 2 - -async_training: - staleness_threshold: 3 - max_queue_size: 100 -``` - -### 性能测试配置 -用于评估系统性能: - -```yaml -# 性能配置 -data: - train_batch_size: 16 - max_prompt_length: 512 - max_response_length: 1024 - -trainer: - total_training_steps: 10 - n_gpus_per_node: 6 - -rollout: - n_gpus_per_node: 2 - -async_training: - staleness_threshold: 3 - max_queue_size: 1000 - generation_timeout: 30.0 -``` - -## 📈 测试结果分析 - -### 成功指标 -测试成功应满足以下条件: - -1. **功能正确性**: - - 样本成功生成和消费 - - 参数同步正常工作 - - 暂停/恢复功能响应 - -2. **性能表现**: - - 样本生成速率 > 目标吞吐量 - - 队列利用率在合理范围(50-80%) - - 新鲜度指标符合预期 - -3. **稳定性**: - - 无内存泄漏 - - 无死锁或竞争条件 - - 优雅处理错误情况 - -### 失败排查 -常见问题及解决方案: - -1. **Ray连接失败**: - ```bash - # 重新初始化Ray - ray stop - ray start --head - ``` - -2. **GPU内存不足**: - ```bash - # 减少批大小或使用梯度检查点 - data.train_batch_size=2 - actor_rollout_ref.model.enable_gradient_checkpointing=True - ``` - -3. 
**队列阻塞**: - ```bash - # 调整队列大小和新鲜度阈值 - async_training.max_queue_size=500 - async_training.staleness_threshold=5 - ``` - -## 🎯 特定功能测试 - -### 测试暂停/恢复功能 -```python -# 在Python脚本中测试 -import ray -from fully_async_rollouter import FullyAsyncRollouter - -rollouter = FullyAsyncRollouter.remote(config, ...) - -# 测试暂停 -result = ray.get(rollouter.pause_rollout.remote()) -assert result == True - -# 测试恢复 -result = ray.get(rollouter.resume_rollout.remote()) -assert result == True -``` - -### 测试新鲜度控制 - -```python -# 测试样本过期机制 -queue = MessageQueueClient.remote(max_staleness=3) - -# 放入旧版本样本 -queue.put_sample.remote(sample, param_version=1) - -# 用新版本获取(应该被拒绝) -result = ray.get(queue.get_samples.remote(current_param_version=5)) -assert result is None -``` - -### 测试参数同步 -```python -# 测试同步重试机制 -sync = ParameterSynchronizer.remote(config, actor_wg, rollout_wg) - -# 测试成功同步 -result = ray.get(sync.sync_weights.remote()) -assert result == True -``` - -## 📝 测试报告 - -### 基准测试报告 -运行`./run_benchmark.sh`后,会在`benchmark_results_*/`目录下生成: - -- `performance_report.md` - 详细的性能报告 -- `summary.txt` - 关键指标摘要 -- `*.log` - 各项测试的详细日志 - -### 关键指标 -需要关注的性能指标: - -1. **吞吐量指标**: - - 样本生成速率 (samples/second) - - 训练步数完成速率 (steps/second) - -2. **延迟指标**: - - 样本平均年龄 (average sample age) - - 参数同步延迟 (sync latency) - -3. 
**资源利用率**: - - GPU利用率 (GPU utilization) - - 内存使用量 (memory usage) - - 队列利用率 (queue utilization) - -## 🔄 CI/CD 集成 - -### GitHub Actions 示例 -```yaml -name: Fully Async Policy Tests -on: [push, pull_request] - -jobs: - test: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - name: Setup Python - uses: actions/setup-python@v2 - with: - python-version: 3.9 - - - name: Install dependencies - run: | - pip install -r requirements.txt - pip install pytest - - - name: Run unit tests - run: | - cd recipe/fully_async_policy/unittest/ - python test_fully_async_components.py - - - name: Run E2E test (if GPUs available) - run: | - if nvidia-smi; then - cd recipe/fully_async_policy/ - ./run_e2e_test.sh - fi -``` - -## 🛠️ 开发者测试 - -### 添加新测试 -1. **单元测试**: 在`unittest/test_fully_async_components.py`中添加新的测试类 -2. **集成测试**: 在相应的集成测试类中添加新方法 -3. **性能测试**: 在`run_benchmark.sh`中添加新的基准测试场景 - -### 测试最佳实践 -1. **隔离性**: 每个测试应该独立,不依赖其他测试 -2. **可重现性**: 使用固定的随机种子和确定性配置 -3. **清理**: 测试结束后清理资源,避免影响后续测试 -4. **文档**: 为新测试添加清晰的文档说明 - -## ❓ 常见问题 - -**Q: 测试失败,提示Ray连接错误** -A: 确保Ray集群正常运行,或重新启动Ray - -**Q: 内存不足错误** -A: 减少批大小或在测试配置中启用参数卸载 - -**Q: 测试运行时间过长** -A: 使用更小的模型或减少训练步数进行快速测试 - -**Q: 如何添加自定义测试?** -A: 参考现有测试模式,在对应的测试文件中添加新的测试方法 - -通过这套完整的测试系统,可以确保fully async policy系统的可靠性、性能和稳定性。 - diff --git a/recipe/fully_async_policy/UNIFIED_PARAM_SYNC.md b/recipe/fully_async_policy/UNIFIED_PARAM_SYNC.md deleted file mode 100644 index e816968f8fc..00000000000 --- a/recipe/fully_async_policy/UNIFIED_PARAM_SYNC.md +++ /dev/null @@ -1,143 +0,0 @@ -# 统一参数同步器使用指南 (Unified Parameter Synchronizer Guide) - -本文档说明了新的统一参数同步器 `UnifiedParameterSynchronizer` 的使用方法。该类合并了原有的多个同步器类的功能,提供了更简洁和统一的接口。 - -## 🏗️ 类合并说明 - -### 原有类结构(已合并) -- `ParameterSynchronizer` - 基础参数同步器 -- `ParameterSyncManager` - Ray Actor形式的参数同步管理器 -- `AsyncParameterSynchronizer` - 异步参数同步器 - -### 新的统一类 -- `UnifiedParameterSynchronizer` - 统一参数同步器,包含所有功能 - -## 🚀 使用方法 - -### 1. 
异步训练模式(推荐) -```python -from recipe.fully_async_policy.param_sync import UnifiedParameterSynchronizer - -# 创建异步模式的参数同步器 -param_synchronizer = UnifiedParameterSynchronizer( - config=config, - trainer_actor=trainer_actor, - rollouter_actor=rollouter_actor -) - -# 同步参数到rollouter -success = param_synchronizer.sync_to_rollouter(new_version=1) -``` - -### 2. Ray Actor模式 -```python -from recipe.fully_async_policy.param_sync import ParameterSyncManager - -# 创建Ray remote参数同步管理器 -sync_manager = ParameterSyncManager.remote(config) - -# 注册workers -success = ray.get(sync_manager.register_workers.remote(actor_workers, rollout_workers)) - -# 执行同步 -success = ray.get(sync_manager.sync_parameters.remote()) -``` - -### 3. 传统模式 -```python -from recipe.fully_async_policy.param_sync import UnifiedParameterSynchronizer - -# 创建传统模式的参数同步器 -synchronizer = UnifiedParameterSynchronizer(config) - -# 初始化同步组 -success = synchronizer.initialize_sync_group(actor_workers, rollout_workers) - -# 同步权重 -success = synchronizer.sync_weights(actor_workers, rollout_workers) -``` - -## 🔄 向后兼容性 - -为了确保现有代码的兼容性,提供了以下别名: - -```python -# 这些别名指向 UnifiedParameterSynchronizer -ParameterSynchronizer = UnifiedParameterSynchronizer -AsyncParameterSynchronizer = UnifiedParameterSynchronizer - -# Ray remote版本 -ParameterSyncManager = ray.remote(UnifiedParameterSynchronizer) -``` - -现有代码无需修改即可使用新的统一同步器。 - -## ⚙️ 初始化参数 - -```python -def __init__(self, config, trainer_actor=None, rollouter_actor=None, as_ray_actor=False): -``` - -- `config`: 配置对象(必需) -- `trainer_actor`: trainer actor引用(用于async模式) -- `rollouter_actor`: rollouter actor引用(用于async模式) -- `as_ray_actor`: 是否作为Ray actor使用 - -## 📊 主要方法 - -### 异步模式 -- `sync_to_rollouter(new_version)`: 同步参数到rollouter -- `get_current_version()`: 获取当前参数版本 - -### Ray Actor模式 -- `register_workers(actor_workers, rollout_workers)`: 注册workers -- `sync_parameters()`: 执行参数同步 - -### 传统模式 -- `initialize_sync_group(actor_workers, rollout_workers)`: 初始化同步组 -- `sync_weights(actor_workers, 
rollout_workers)`: 同步权重 - -### 通用方法 -- `get_statistics()`: 获取统计信息 -- `get_weights_info()`: 获取权重信息 -- `cleanup()`: 清理资源 - -## 📈 统计信息 - -```python -stats = synchronizer.get_statistics() -# 返回: -{ - "sync_count": 15, - "sync_failures": 0, - "last_sync_time": 1640995200.0, - "sync_group_initialized": True, - "current_param_version": 15, - "current_version": 15, - "is_ready": True # 仅在Ray actor模式下 -} -``` - -## 🎯 优势 - -1. **统一接口**: 一个类支持所有同步模式 -2. **向后兼容**: 现有代码无需修改 -3. **灵活配置**: 支持多种初始化方式 -4. **完整功能**: 包含所有原有类的功能 -5. **简化维护**: 减少代码重复,便于维护 - -## 🔧 配置示例 - -```yaml -async_training: - max_sync_retries: 3 - sync_timeout: 30.0 - sync_retry_delay: 1.0 - sync_monitor_interval: 60.0 - staleness_threshold: 3 -``` - ---- - -*统一参数同步器简化了参数同步的使用,同时保持了所有原有功能的完整性。* - diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py index aa5ac81f48a..cf5c0e29d5c 100644 --- a/recipe/fully_async_policy/fully_async_main.py +++ b/recipe/fully_async_policy/fully_async_main.py @@ -248,16 +248,18 @@ def _initialize_components(self, config) -> None: print("Setting up parameter synchronization...") from recipe.fully_async_policy.param_sync import ParameterSynchronizer - param_synchronizer = ParameterSynchronizer( + param_synchronizer = ParameterSynchronizer.remote( config=config, - actor_wg=self.components["trainer"], - rollout_wg=self.components["rollouter"], + trainer=self.components["trainer"], + rollouter=self.components["rollouter"], ) # 将参数同步器设置到trainer和rollouter ray.get(self.components["trainer"].set_parameter_synchronizer.remote(param_synchronizer)) ray.get(self.components["rollouter"].set_parameter_synchronizer.remote(param_synchronizer)) + ray.get(param_synchronizer.sync_weights.remote(0)) + self.components["param_synchronizer"] = param_synchronizer print("Parameter synchronizer initialized successfully") print("All components initialized successfully") @@ -332,6 +334,64 @@ def _run_training_loop(self): print("Training completed or 
interrupted") + def _cleanup_resources(self): + """清理所有资源""" + try: + # 关闭线程池 + if hasattr(self, 'thread_executor') and self.thread_executor: + print("Shutting down thread executor...") + self.thread_executor.shutdown(wait=True, timeout=10.0) + + # 清理logger + if hasattr(self, 'logger') and self.logger: + try: + if hasattr(self.logger, 'close'): + self.logger.close() + elif hasattr(self.logger, 'finish'): + self.logger.finish() + except Exception as e: + print(f"Error closing logger: {e}") + + # 清理validation logger + if hasattr(self, 'validation_generations_logger') and self.validation_generations_logger: + try: + if hasattr(self.validation_generations_logger, 'close'): + self.validation_generations_logger.close() + except Exception as e: + print(f"Error closing validation logger: {e}") + + # 清理异步rollout管理器 + if hasattr(self, "async_rollout_manager") and self.async_rollout_manager: + try: + if hasattr(self.async_rollout_manager, 'shutdown'): + self.async_rollout_manager.shutdown() + except Exception as e: + print(f"Error cleaning up async rollout manager: {e}") + + # 清理worker groups + if hasattr(self, 'rollout_wg') and self.rollout_wg: + try: + if hasattr(self.rollout_wg, 'shutdown'): + self.rollout_wg.shutdown() + except Exception as e: + print(f"Error cleaning up rollout worker group: {e}") + + # 强制垃圾回收 + import gc + gc.collect() + + except Exception as e: + print(f"Error during resource cleanup: {e}") + + def __del__(self): + """析构函数 - 确保资源清理""" + try: + if hasattr(self, 'running') and self.running: + print("Warning: FullyAsyncRollouter being deleted while still running") + self.shutdown() + except Exception as e: + print(f"Error in destructor: {e}") + @hydra.main(config_path="config", config_name="fully_async_ppo_trainer", version_base=None) def main(config): diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index c760215c580..1ca9c7b0d2e 100644 --- 
a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -155,6 +155,10 @@ def set_parameter_synchronizer(self, param_synchronizer): with self.lock: self.param_synchronizer = param_synchronizer + def get_rollout_wg(self): + """获取 rollout worker group""" + return self.rollout_wg + def _validate_config(self): # 验证异步训练配置 if not hasattr(self.config, "async_training"): diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 20bae1a6a64..afef0968a04 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -46,36 +46,17 @@ class FullyAsyncTrainer(RayPPOTrainer): """ def __init__( - self, - config, - tokenizer, - role_worker_mapping: dict[Role, WorkerType], - resource_pool_manager: ResourcePoolManager, - ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, - processor=None, - reward_fn=None, - val_reward_fn=None, - device_name=None, + self, + config, + tokenizer, + role_worker_mapping: dict[Role, WorkerType], + resource_pool_manager: ResourcePoolManager, + ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, + processor=None, + reward_fn=None, + val_reward_fn=None, + device_name=None, ): - """ - Initialize distributed PPO trainer with Ray backend. - Note that this trainer runs on the driver process on a single CPU/GPU node. - - Args: - config: Configuration object containing training parameters. - tokenizer: Tokenizer used for encoding and decoding text. - role_worker_mapping (dict[Role, WorkerType]): Mapping from roles to worker classes. - resource_pool_manager (ResourcePoolManager): Manager for Ray resource pools. - ray_worker_group_cls (RayWorkerGroup, optional): Class for Ray worker groups. Defaults to RayWorkerGroup. - processor: Optional data processor, used for multimodal data - reward_fn: Function for computing rewards during training. 
- val_reward_fn: Function for computing rewards during validation. - train_dataset (Optional[Dataset], optional): Training dataset. Defaults to None. - val_dataset (Optional[Dataset], optional): Validation dataset. Defaults to None. - collate_fn: Function to collate data samples into batches. - train_sampler (Optional[Sampler], optional): Sampler for the training dataset. Defaults to None. - device_name (str, optional): Device name for training (e.g., "cuda", "cpu"). Defaults to None. - """ # Store the tokenizer for text processing self.tokenizer = tokenizer @@ -141,59 +122,9 @@ def set_parameter_synchronizer(self, param_synchronizer): with self.lock: self.param_synchronizer = param_synchronizer - def _get_actor_params(self): - """ - 获取actor参数 - 基于one_step_off_policy的实现 - """ - if not hasattr(self, "actor_wg") or self.actor_wg is None: - raise ValueError("Actor worker group not initialized") - - # 从actor worker group获取参数 - actor_workers = self.actor_wg.workers - if not actor_workers: - raise ValueError("No actor workers available") - - # 获取第一个actor worker的参数信息 - params_future = actor_workers[0]._get_actor_params.remote() - params = ray.get(params_future, timeout=10.0) - return params - - def get_actor_weights_info(self): - """ - 获取actor权重信息 - 基于one_step_off_policy的模式 - """ - if hasattr(self, "_weights_info") and self._weights_info is not None: - return self._weights_info - - if not hasattr(self, "actor_wg") or self.actor_wg is None: - raise ValueError("Actor worker group not initialized") - - # 从actor worker group获取权重信息 - weights_info_future = self.actor_wg.get_actor_weights_info.remote() - weights_info = ray.get(weights_info_future, timeout=10.0) - - # 缓存权重信息 - self._weights_info = weights_info[0] if isinstance(weights_info, list) else weights_info - return self._weights_info - - def sync_rollout_weights(self): - """ - 同步rollout权重 - Actor端的同步操作 - """ - if not hasattr(self, "actor_wg") or self.actor_wg is None: - logger.warning("Actor worker group not initialized 
for sync") - return False - - try: - # 触发actor worker group的参数同步 - sync_future = self.actor_wg.sync_rollout_weights.remote() - ray.get(sync_future, timeout=30.0) - logger.debug("Actor weights sync completed") - return True - - except Exception as e: - logger.error(f"Failed to sync actor weights: {e}") - return False + def get_actor_wg(self): + """获取 actor worker group""" + return self.actor_wg def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]: """ @@ -457,22 +388,14 @@ def fit(self): # self._post_batch_processing(batch) print("step end") - # 在训练步骤结束后触发参数同步 self._trigger_parameter_sync_after_step() - - # TODO: make a canonical logger that supports various backend - print(data=metrics, step=self.global_steps) - # progress_bar.update(1) self.global_steps += 1 - print("is_last_step") - # if is_last_step: - # pprint(f"Final validation metrics: {last_val_metrics}") - # print("is_last_step") - # # progress_bar.close() - # return - ray.get(self.param_synchronizer.sync_weights.remote(self.global_steps)) + print(f"is_last_step {is_last_step}") + if is_last_step: + print("is_last_step") + return def get_statistics(self) -> dict: """获取训练统计信息""" @@ -494,79 +417,12 @@ def _trigger_parameter_sync_after_step(self): 在训练步骤结束后触发参数同步 这确保rollouter总是使用最新训练的参数 """ - if not self.param_synchronizer: - logger.debug("No parameter synchronizer available, skipping sync") - return - - try: - # 更新参数版本号 - new_version = self.current_param_version + 1 - - print( - f"[TRAINER] Triggering parameter sync after training step {self.global_steps}, version: {new_version}" - ) - logger.info(f"Triggering parameter sync after training step {self.global_steps}, version: {new_version}") - - # 异步触发参数同步,不阻塞训练流程 - import threading - - sync_thread = threading.Thread(target=self._async_parameter_sync, args=(new_version,), daemon=True) - sync_thread.start() - - except Exception as e: - logger.error(f"Error triggering parameter sync: {e}") - - def _async_parameter_sync(self, new_version: int): 
- """ - 异步执行参数同步,避免阻塞训练流程 - - Args: - new_version: 新的参数版本号 - """ - try: - # 执行参数同步 - success = self.param_synchronizer.sync_to_rollouter(new_version) - - if success: - # 更新本地参数版本 - with self.lock: - self.current_param_version = new_version - self.param_sync_count += 1 - - print(f"[TRAINER] Parameter sync completed successfully for version {new_version}") - logger.info(f"Parameter sync completed successfully for version {new_version}") - else: - print(f"[TRAINER] Parameter sync failed for version {new_version}") - logger.warning(f"Parameter sync failed for version {new_version}") - - except Exception as e: - logger.error(f"Error in async parameter sync: {e}") - - def update_param_version(self, param_version: int) -> bool: - """ - 更新trainer的参数版本,用于跟踪与rollouter的参数同步状态 - - Args: - param_version: 新的参数版本号 - - Returns: - bool: 是否成功更新 - """ - try: - with self.lock: - old_version = self.current_param_version - self.current_param_version = param_version - self.param_sync_count += 1 - - # 更新消息队列的参数版本 - if self.message_queue_client: - self.message_queue_client.update_param_version(param_version) - - print(f"Updated trainer param version from {old_version} to {param_version}") - return True - except Exception as e: - logger.error(f"Error updating param version: {e}") - return False + new_version = self.current_param_version + 1 + print( + f"[TRAINER] Triggering parameter sync after training step {self.global_steps}, version: {new_version}" + ) + logger.info(f"Triggering parameter sync after training step {self.global_steps}, version: {new_version}") + ray.get(self.param_synchronizer.sync_weights.remote(new_version)) def _compute_sample_freshness_metrics(self, batch_samples: list[QueueSample]) -> dict: """ diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py index 10843302786..3657916dda0 100644 --- a/recipe/fully_async_policy/param_sync.py +++ b/recipe/fully_async_policy/param_sync.py @@ -28,18 +28,13 @@ class ParameterSynchronizer: 
合并了原有的多个同步器类的功能 """ - def __init__(self, config, actor_wg, rollout_wg): - """ - 初始化统一参数同步器 - - Args: - config: 配置对象 - actor_wg: trainer actor引用(用于async模式) - rollout_wg: rollouter actor引用(用于async模式) - """ + def __init__(self, config, trainer, rollouter): + self.config = config - self.actor_wg = actor_wg - self.rollout_wg = rollout_wg + self.trainer = trainer + self.rollouter = rollouter + self.actor_wg = ray.get(trainer.get_actor_wg.remote()) + self.rollout_wg = ray.get(rollouter.get_rollout_wg.remote()) # 基础属性 self.weights_info = None @@ -78,5 +73,9 @@ def _init_sync_group(self): def sync_weights(self, version): self.current_version = version logger.debug(f"Starting weight synchronization (version {self.current_version})...") + + # TODO 暂停及恢复rollout + print("TODO 暂停及恢复rollout") self.actor_wg.sync_rollout_weights() ray.get(self.rollout_wg.sync_rollout_weights()) + print("sync_weights success") diff --git a/recipe/fully_async_policy/run_benchmark.sh b/recipe/fully_async_policy/run_benchmark.sh deleted file mode 100755 index f9bfaceaa32..00000000000 --- a/recipe/fully_async_policy/run_benchmark.sh +++ /dev/null @@ -1,307 +0,0 @@ -#!/usr/bin/env bash -set -xeuo pipefail - -# Benchmark script for fully_async_policy performance testing -# This script runs various performance tests to evaluate the async training system - -NUM_GPUS=${NUM_GPUS:-8} -ACTOR_STRATEGY=${ACTOR_STRATEGY:-"fsdp2"} - -# Download model if not exists -MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B-Instruct} -MODEL_PATH=${MODEL_PATH:-${HOME}/models/${MODEL_ID}} -huggingface-cli download "${MODEL_ID}" --local-dir "${MODEL_PATH}" - -# Create benchmark results directory -BENCHMARK_DIR="benchmark_results_$(date +%Y%m%d_%H%M%S)" -mkdir -p "${BENCHMARK_DIR}" - -echo "Starting fully_async_policy performance benchmark..." 
-echo "Results will be saved to: ${BENCHMARK_DIR}" - -# Benchmark parameters -n_gpus_rollout=2 -n_gpus_training=$((NUM_GPUS - n_gpus_rollout)) - -# Common parameters -train_prompt_bsz=16 -n_resp_per_prompt=4 -train_prompt_mini_bsz=4 -max_prompt_length=512 -max_response_length=1024 - -# Benchmark Test 1: Different staleness thresholds -echo "=== Benchmark Test 1: Staleness Threshold Impact ===" -staleness_values=(1 3 5 10) - -for staleness in "${staleness_values[@]}"; do - echo "Testing staleness threshold: ${staleness}" - - exp_name="benchmark-staleness-${staleness}" - log_file="${BENCHMARK_DIR}/staleness_${staleness}.log" - - timeout 300 python3 -m recipe.fully_async_policy.fully_async_main \ - data.train_files="${HOME}/data/gsm8k/train.parquet" \ - data.val_files="${HOME}/data/gsm8k/test.parquet" \ - data.prompt_key=prompt \ - data.truncation='left' \ - data.max_prompt_length=${max_prompt_length} \ - data.max_response_length=${max_response_length} \ - data.train_batch_size=${train_prompt_bsz} \ - actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ - actor_rollout_ref.model.path="${MODEL_PATH}" \ - trainer.logger=['console'] \ - trainer.project_name='verl-benchmark' \ - trainer.experiment_name="${exp_name}" \ - trainer.val_before_train=False \ - trainer.test_freq=-1 \ - trainer.save_freq=-1 \ - trainer.total_epochs=1 \ - trainer.total_training_steps=10 \ - trainer.n_gpus_per_node=${n_gpus_training} \ - rollout.n_gpus_per_node=${n_gpus_rollout} \ - async_training.staleness_threshold=${staleness} \ - async_training.max_staleness_allowed=$((staleness + 2)) \ - > "${log_file}" 2>&1 || echo "Test with staleness ${staleness} timed out or failed" - - # Extract key metrics from log - if [ -f "${log_file}" ]; then - echo "=== Metrics for staleness=${staleness} ===" >> "${BENCHMARK_DIR}/summary.txt" - grep -E "(Generated.*batches|Dropped.*samples|param_version|Queue size)" "${log_file}" | tail -5 >> "${BENCHMARK_DIR}/summary.txt" || true - echo "" >> 
"${BENCHMARK_DIR}/summary.txt" - fi -done - -# Benchmark Test 2: Different queue sizes -echo "=== Benchmark Test 2: Queue Size Impact ===" -queue_sizes=(50 100 500 1000) - -for queue_size in "${queue_sizes[@]}"; do - echo "Testing queue size: ${queue_size}" - - exp_name="benchmark-queue-${queue_size}" - log_file="${BENCHMARK_DIR}/queue_${queue_size}.log" - - timeout 300 python3 -m recipe.fully_async_policy.fully_async_main \ - data.train_files="${HOME}/data/gsm8k/train.parquet" \ - data.val_files="${HOME}/data/gsm8k/test.parquet" \ - data.prompt_key=prompt \ - data.truncation='left' \ - data.max_prompt_length=${max_prompt_length} \ - data.max_response_length=${max_response_length} \ - data.train_batch_size=${train_prompt_bsz} \ - actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ - actor_rollout_ref.model.path="${MODEL_PATH}" \ - trainer.logger=['console'] \ - trainer.project_name='verl-benchmark' \ - trainer.experiment_name="${exp_name}" \ - trainer.val_before_train=False \ - trainer.test_freq=-1 \ - trainer.save_freq=-1 \ - trainer.total_epochs=1 \ - trainer.total_training_steps=10 \ - trainer.n_gpus_per_node=${n_gpus_training} \ - rollout.n_gpus_per_node=${n_gpus_rollout} \ - async_training.max_queue_size=${queue_size} \ - > "${log_file}" 2>&1 || echo "Test with queue size ${queue_size} timed out or failed" - - # Extract key metrics from log - if [ -f "${log_file}" ]; then - echo "=== Metrics for queue_size=${queue_size} ===" >> "${BENCHMARK_DIR}/summary.txt" - grep -E "(Generated.*batches|Queue size|memory)" "${log_file}" | tail -5 >> "${BENCHMARK_DIR}/summary.txt" || true - echo "" >> "${BENCHMARK_DIR}/summary.txt" - fi -done - -# Benchmark Test 3: Different batch generation intervals -echo "=== Benchmark Test 3: Generation Interval Impact ===" -intervals=(0.0 0.1 0.5 1.0) - -for interval in "${intervals[@]}"; do - echo "Testing batch generation interval: ${interval}s" - - exp_name="benchmark-interval-${interval}" - 
log_file="${BENCHMARK_DIR}/interval_${interval}.log" - - timeout 300 python3 -m recipe.fully_async_policy.fully_async_main \ - data.train_files="${HOME}/data/gsm8k/train.parquet" \ - data.val_files="${HOME}/data/gsm8k/test.parquet" \ - data.prompt_key=prompt \ - data.truncation='left' \ - data.max_prompt_length=${max_prompt_length} \ - data.max_response_length=${max_response_length} \ - data.train_batch_size=${train_prompt_bsz} \ - actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ - actor_rollout_ref.model.path="${MODEL_PATH}" \ - trainer.logger=['console'] \ - trainer.project_name='verl-benchmark' \ - trainer.experiment_name="${exp_name}" \ - trainer.val_before_train=False \ - trainer.test_freq=-1 \ - trainer.save_freq=-1 \ - trainer.total_epochs=1 \ - trainer.total_training_steps=10 \ - trainer.n_gpus_per_node=${n_gpus_training} \ - rollout.n_gpus_per_node=${n_gpus_rollout} \ - async_training.batch_generation_interval=${interval} \ - > "${log_file}" 2>&1 || echo "Test with interval ${interval} timed out or failed" - - # Extract key metrics from log - if [ -f "${log_file}" ]; then - echo "=== Metrics for interval=${interval}s ===" >> "${BENCHMARK_DIR}/summary.txt" - grep -E "(Generated.*batches|generation_timestamp)" "${log_file}" | tail -5 >> "${BENCHMARK_DIR}/summary.txt" || true - echo "" >> "${BENCHMARK_DIR}/summary.txt" - fi -done - -# Benchmark Test 4: Resource allocation comparison -echo "=== Benchmark Test 4: Resource Allocation Comparison ===" - -# Test different rollout/training GPU distributions -if [ "${NUM_GPUS}" -ge "6" ]; then - gpu_configs=( - "1,$((NUM_GPUS - 1))" # 1 rollout, rest training - "2,$((NUM_GPUS - 2))" # 2 rollout, rest training - "3,$((NUM_GPUS - 3))" # 3 rollout, rest training - ) - - for config in "${gpu_configs[@]}"; do - IFS=',' read -r rollout_gpus training_gpus <<< "$config" - - echo "Testing GPU allocation: ${rollout_gpus} rollout, ${training_gpus} training" - - exp_name="benchmark-gpu-${rollout_gpus}r-${training_gpus}t" - 
log_file="${BENCHMARK_DIR}/gpu_${rollout_gpus}_${training_gpus}.log" - - timeout 300 python3 -m recipe.fully_async_policy.fully_async_main \ - data.train_files="${HOME}/data/gsm8k/train.parquet" \ - data.val_files="${HOME}/data/gsm8k/test.parquet" \ - data.prompt_key=prompt \ - data.truncation='left' \ - data.max_prompt_length=${max_prompt_length} \ - data.max_response_length=${max_response_length} \ - data.train_batch_size=${train_prompt_bsz} \ - actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ - actor_rollout_ref.model.path="${MODEL_PATH}" \ - trainer.logger=['console'] \ - trainer.project_name='verl-benchmark' \ - trainer.experiment_name="${exp_name}" \ - trainer.val_before_train=False \ - trainer.test_freq=-1 \ - trainer.save_freq=-1 \ - trainer.total_epochs=1 \ - trainer.total_training_steps=10 \ - trainer.n_gpus_per_node=${training_gpus} \ - rollout.n_gpus_per_node=${rollout_gpus} \ - > "${log_file}" 2>&1 || echo "Test with GPU config ${config} timed out or failed" - - # Extract key metrics from log - if [ -f "${log_file}" ]; then - echo "=== Metrics for ${rollout_gpus}r/${training_gpus}t GPUs ===" >> "${BENCHMARK_DIR}/summary.txt" - grep -E "(Generated.*batches|training.*steps|GPU)" "${log_file}" | tail -5 >> "${BENCHMARK_DIR}/summary.txt" || true - echo "" >> "${BENCHMARK_DIR}/summary.txt" - fi - done -fi - -# Benchmark Test 5: Pause/Resume Performance -echo "=== Benchmark Test 5: Pause/Resume Performance Test ===" -log_file="${BENCHMARK_DIR}/pause_resume.log" - -# Start the training in background -python3 -m recipe.fully_async_policy.fully_async_main \ - data.train_files="${HOME}/data/gsm8k/train.parquet" \ - data.val_files="${HOME}/data/gsm8k/test.parquet" \ - data.prompt_key=prompt \ - data.truncation='left' \ - data.max_prompt_length=${max_prompt_length} \ - data.max_response_length=${max_response_length} \ - data.train_batch_size=${train_prompt_bsz} \ - actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ - actor_rollout_ref.model.path="${MODEL_PATH}" 
\ - trainer.logger=['console'] \ - trainer.project_name='verl-benchmark-pause' \ - trainer.experiment_name='pause-resume-test' \ - trainer.val_before_train=False \ - trainer.test_freq=-1 \ - trainer.save_freq=-1 \ - trainer.total_epochs=1 \ - trainer.total_training_steps=20 \ - trainer.n_gpus_per_node=${n_gpus_training} \ - rollout.n_gpus_per_node=${n_gpus_rollout} \ - > "${log_file}" 2>&1 & - -TRAINING_PID=$! - -# Note: In actual implementation, we would need a way to remotely control pause/resume -# This is a placeholder for testing the pause/resume functionality -echo "Training started with PID: ${TRAINING_PID}" -echo "Pause/resume testing would require remote control interface" >> "${BENCHMARK_DIR}/summary.txt" - -# Wait a bit and then kill the training (simulating early termination) -sleep 60 -if kill -0 $TRAINING_PID 2>/dev/null; then - echo "Stopping training process..." - kill $TRAINING_PID -fi - -# Generate performance report -echo "=== Generating Performance Report ===" -report_file="${BENCHMARK_DIR}/performance_report.md" - -cat > "${report_file}" << EOF -# Fully Async Policy Performance Benchmark Report - -**Date:** $(date) -**Hardware:** ${NUM_GPUS} GPUs -**Strategy:** ${ACTOR_STRATEGY} -**Model:** ${MODEL_ID} - -## Test Configuration -- Training Batch Size: ${train_prompt_bsz} -- Responses per Prompt: ${n_resp_per_prompt} -- Max Prompt Length: ${max_prompt_length} -- Max Response Length: ${max_response_length} - -## Results Summary -$(cat "${BENCHMARK_DIR}/summary.txt" 2>/dev/null || echo "No summary available") - -## Log Files -EOF - -# List all log files -for log_file in "${BENCHMARK_DIR}"/*.log; do - if [ -f "$log_file" ]; then - echo "- $(basename "${log_file}")" >> "${report_file}" - fi -done - -cat >> "${report_file}" << EOF - -## Key Findings -- **Staleness Impact:** Lower staleness thresholds may increase sample dropping but improve freshness -- **Queue Size Impact:** Larger queues provide better buffering but use more memory -- **Generation 
Interval:** Shorter intervals increase throughput but may stress the system -- **GPU Allocation:** Balance between generation and training capacity is crucial -- **Pause/Resume:** System should handle interruptions gracefully - -## Recommendations -1. Start with staleness_threshold=3 for good balance -2. Use queue_size=500-1000 for most workloads -3. Set generation_interval=0.1s for good performance -4. Allocate 2-3 GPUs for rollout in typical 8-GPU setups -5. Monitor queue utilization and adjust based on workload - -EOF - -echo "Benchmark completed!" -echo "Results saved to: ${BENCHMARK_DIR}/" -echo "Performance report: ${report_file}" - -# Print summary to console -if [ -f "${BENCHMARK_DIR}/summary.txt" ]; then - echo "" - echo "=== BENCHMARK SUMMARY ===" - cat "${BENCHMARK_DIR}/summary.txt" -fi - diff --git a/recipe/fully_async_policy/run_fully_async_example.sh b/recipe/fully_async_policy/run_fully_async_example.sh deleted file mode 100644 index cd2265cde0d..00000000000 --- a/recipe/fully_async_policy/run_fully_async_example.sh +++ /dev/null @@ -1,147 +0,0 @@ -#!/bin/bash -# Copyright 2025 Meituan Ltd. and/or its affiliates -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -set -x - -# 实验配置 -project_name='FullyAsyncPPO' -exp_name='async-qwen2.5-7b-test' - -# 模型和数据路径 -MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen2.5-7B-Instruct"} -TRAIN_FILE=${TRAIN_FILE:-"~/data/train.parquet"} -VAL_FILE=${VAL_FILE:-"~/data/val.parquet"} - -# 硬件配置 -NNODES=${NNODES:-1} -NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} - -# 异步训练资源分配 -n_gpus_rollout=3 # rollout专用GPU数量 -n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout)) # 训练GPU数量 - -echo "===================================" -echo "完全异步PPO训练启动" -echo "===================================" -echo "模型路径: $MODEL_PATH" -echo "训练数据: $TRAIN_FILE" -echo "验证数据: $VAL_FILE" -echo "节点数: $NNODES" -echo "每节点GPU数: $NGPUS_PER_NODE" -echo "Rollout GPU数: $n_gpus_rollout" -echo "训练GPU数: $n_gpus_training" -echo "===================================" - -# 算法参数 -temperature=1.0 -top_p=1.0 -top_k=-1 - -# 序列长度 -max_prompt_length=1024 -max_response_length=1024 - -# 异步训练参数 -staleness_threshold=3 -max_queue_size=1000 -min_batch_count=1 -batch_timeout=30.0 - -# 训练参数 -train_batch_size=128 -total_training_steps=1000 -save_freq=100 -val_freq=50 - -# 设置环境变量 -export NCCL_DEBUG=WARN -export VLLM_USE_V1=1 -export VERL_LOGGING_LEVEL=INFO - -# 启动训练 -python -m recipe.one_step_off_policy.fully_async_main \ - trainer.project_name="$project_name" \ - trainer.experiment_name="$exp_name" \ - trainer.device=cuda \ - trainer.nnodes=$NNODES \ - trainer.n_gpus_per_node=$NGPUS_PER_NODE \ - data.train_files="$TRAIN_FILE" \ - data.val_files="$VAL_FILE" \ - data.train_batch_size=$train_batch_size \ - data.max_prompt_length=$max_prompt_length \ - data.max_response_length=$max_response_length \ - data.train_files="$TRAIN_FILE" \ - data.val_files="$VAL_FILE" \ - data.train_batch_size=$train_batch_size \ - data.max_prompt_length=$max_prompt_length \ - data.max_response_length=$max_response_length \ - \ - # 模型配置 - actor_rollout_ref.model.path="$MODEL_PATH" \ - actor_rollout_ref.model.lora_rank=64 \ - actor_rollout_ref.model.lora_alpha=128 \ - \ - # Rollout配置 - 
actor_rollout_ref.rollout.mode=async \ - actor_rollout_ref.rollout.n_gpus=$n_gpus_rollout \ - actor_rollout_ref.rollout.name=vllm \ - actor_rollout_ref.rollout.temperature=$temperature \ - actor_rollout_ref.rollout.top_k=$top_k \ - actor_rollout_ref.rollout.top_p=$top_p \ - actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ - actor_rollout_ref.rollout.max_num_batched_tokens=8192 \ - actor_rollout_ref.rollout.free_cache_engine=true \ - actor_rollout_ref.rollout.enforce_eager=true \ - \ - # Actor配置 - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.actor.ppo_mini_batch_size=32 \ - actor_rollout_ref.actor.use_dynamic_bsz=true \ - actor_rollout_ref.actor.fsdp_config.param_offload=false \ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=false \ - \ - # Critic配置 - critic.model.path="$MODEL_PATH" \ - critic.optim.lr=1e-5 \ - critic.fsdp_config.param_offload=false \ - \ - # 异步训练配置 - async_training.staleness_threshold=$staleness_threshold \ - async_training.max_queue_size=$max_queue_size \ - async_training.min_batch_count=$min_batch_count \ - async_training.batch_timeout=$batch_timeout \ - \ - # 训练配置 - trainer.total_training_steps=$total_training_steps \ - trainer.save_freq=$save_freq \ - trainer.val_freq=$val_freq \ - trainer.critic_warmup=0 \ - \ - # 算法配置 - algorithm.adv_estimator=gae \ - algorithm.cliprange=0.2 \ - algorithm.vf_coeff=0.1 \ - algorithm.entropy_coeff=0.01 \ - algorithm.kl_coeff=0.1 \ - \ - # 日志配置 - trainer.logger='["console", "wandb"]' \ - trainer.val_before_train=false - -echo "===================================" -echo "完全异步PPO训练完成" -echo "===================================" - diff --git a/recipe/fully_async_policy/test_components_pytest.py b/recipe/fully_async_policy/unittest/test_components_pytest.py similarity index 100% rename from recipe/fully_async_policy/test_components_pytest.py rename to recipe/fully_async_policy/unittest/test_components_pytest.py From 
c819fe1b6a9dc1f1ea92469ddb35bb79c975ca50 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Fri, 8 Aug 2025 17:31:08 +0800 Subject: [PATCH 031/182] ParameterSynchronizer --- .../fully_async_policy/README_fully_async.md | 336 ------------- .../config/fully_async_ppo_trainer.yaml | 2 + recipe/fully_async_policy/fully_async_main.py | 99 +--- .../fully_async_rollouter.py | 207 ++------ .../fully_async_policy/fully_async_trainer.py | 45 +- recipe/fully_async_policy/message_queue.py | 6 +- recipe/fully_async_policy/param_sync.py | 18 +- .../unittest/test_components_pytest.py | 315 ------------- .../unittest/test_fully_async.py | 194 -------- .../unittest/test_fully_async_components.py | 444 ------------------ tests/special_e2e/run_fully_async_policy.sh | 2 + verl/trainer/ppo/ray_trainer.py | 2 +- 12 files changed, 91 insertions(+), 1579 deletions(-) delete mode 100644 recipe/fully_async_policy/README_fully_async.md delete mode 100644 recipe/fully_async_policy/unittest/test_components_pytest.py delete mode 100644 recipe/fully_async_policy/unittest/test_fully_async.py delete mode 100644 recipe/fully_async_policy/unittest/test_fully_async_components.py diff --git a/recipe/fully_async_policy/README_fully_async.md b/recipe/fully_async_policy/README_fully_async.md deleted file mode 100644 index 916633a4a81..00000000000 --- a/recipe/fully_async_policy/README_fully_async.md +++ /dev/null @@ -1,336 +0,0 @@ -# 完全异步PPO训练系统 (Fully Async Policy) - -本文档介绍了基于 OneStepOffRayTrainer 成熟实现改进的完全异步PPO训练系统,该系统实现了 Trainer 和 Rollouter 的完全解耦,支持异步样本生成和训练。 - -## 🚀 **系统特性** - -### 核心特性 -- **完全异步训练**: Trainer 和 Rollouter 在独立的Ray Actor中运行,实现真正的并行处理 -- **智能新鲜度控制**: 基于参数版本和时间戳的样本新鲜度管理,防止过期样本影响训练 -- **健壮的参数同步**: 改进的参数同步机制,支持错误重试和状态管理 -- **简化的消息队列**: 去除ZeroMQ依赖,使用Ray-based消息传递,更稳定可靠 -- **完善的监控**: 详细的性能指标和组件健康状态监控 - -### 改进亮点 -- **参考OneStepOffRayTrainer**: 使用成熟的训练逻辑,确保训练稳定性 -- **错误处理和恢复**: 完善的异常处理和资源清理机制 -- **组件协调**: 统一的组件生命周期管理和状态监控 -- **配置验证**: 智能的配置验证和默认值设置 - -## 🏗️ **系统架构** - -### 组件结构 - -``` 
-┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ -│ FullyAsyncMain │────│ MessageQueue │────│ FullyAsyncTrainer│ -│ (Coordinator) │ │ (Ray Actor) │ │ (Ray Actor) │ -└─────────────────┘ └─────────────────┘ └─────────────────┘ - │ │ │ - └───────────────────────┼───────────────────────┘ - │ - ┌─────────────────┐ - │ Rollouter │ - │ (Ray Actor) │ - └─────────────────┘ - │ - ┌─────────────────┐ - │ ParameterSync │ - │ Manager │ - └─────────────────┘ -``` - -### 数据流 - -``` -1. 数据生成: Rollouter → MessageQueue -2. 训练消费: MessageQueue → FullyAsyncTrainer -3. 参数同步: FullyAsyncTrainer → Rollouter -4. 状态监控: FullyAsyncMain → All Components -``` - -## 📋 **核心组件** - -### 1. FullyAsyncTrainer -- **功能**: 从MessageQueue获取样本进行异步训练 -- **特性**: - - 基于OneStepOffRayTrainer的成熟训练逻辑 - - 智能的样本新鲜度指标计算 - - 完善的错误处理和重试机制 - - 详细的训练性能监控 - -### 2. Rollouter -- **功能**: 持续生成训练样本并放入MessageQueue -- **特性**: - - 智能的暂停/恢复控制机制 - - 基于新鲜度的生成控制 - - 改进的参数同步处理 - - 异步/同步生成模式支持 - -### 3. MessageQueue -- **功能**: Ray-based消息队列,管理样本传递 -- **特性**: - - 去除ZeroMQ依赖,更稳定可靠 - - 智能的样本过期检测 - - 线程安全的队列操作 - - 内存使用监控 - -### 4. ParameterSynchronizer -- **功能**: 管理Actor和Rollout间的参数同步 -- **特性**: - - 支持错误重试和超时处理 - - 详细的同步状态跟踪 - - 集群通信组管理 - -### 5. 
FullyAsyncMain -- **功能**: 系统协调器,管理所有组件的生命周期 -- **特性**: - - 统一的组件初始化和清理 - - 实时的健康状态监控 - - 优雅的关闭和错误恢复 - -## ⚙️ **配置说明** - -### 异步训练配置 (async_training) - -```yaml -async_training: - # 新鲜度控制 - staleness_threshold: 3 # 样本新鲜度阈值 - max_staleness_allowed: 5 # 最大允许的样本陈旧度 - - # 队列管理 - max_queue_size: 1000 # 消息队列最大大小 - min_batch_count: 1 # 每次获取的最小batch数量 - batch_timeout: 30.0 # 获取batch的超时时间 - - # 生成控制 - generation_timeout: 30.0 # 单次生成的超时时间 - batch_generation_interval: 0.1 # batch生成间隔 - - # 参数同步 - max_sync_retries: 3 # 参数同步最大重试次数 - sync_timeout: 30.0 # 同步超时时间 - sync_retry_delay: 1.0 # 重试延迟时间 -``` - -### 资源配置 - -```yaml -trainer: - n_gpus_per_node: 4 # 每个训练节点的GPU数量 - nnodes: 2 # 训练节点数量 - device: cuda - -rollout: - n_gpus_per_node: 2 # 每个rollout节点的GPU数量 - nnodes: 1 # rollout节点数量 -``` - -## 🔧 **使用方法** - -### 1. 基本运行 - -```bash -# 使用默认配置运行 -python fully_async_main.py - -# 使用自定义配置 -python fully_async_main.py --config-path /path/to/config --config-name my_config -``` - -### 2. 配置自定义 - -```python -# 在配置文件中自定义异步训练参数 -async_training: - staleness_threshold: 5 - max_queue_size: 2000 - generation_timeout: 60.0 -``` - -### 3. 监控和调试 - -```python -# 系统会自动输出详细的统计信息 -# 包括: Trainer状态、Rollouter状态、队列状态等 - -# 日志文件: fully_async_training.log -# 包含所有组件的详细日志信息 -``` - -## 📊 **性能监控** - -### 关键指标 - -#### Trainer指标 -- `global_steps`: 训练步数 -- `processed_samples`: 已处理样本数 -- `current_param_version`: 当前参数版本 -- `param_sync_count`: 参数同步次数 - -#### Rollouter指标 -- `total_generated_samples`: 总生成样本数 -- `dropped_stale_samples`: 丢弃的过期样本数 -- `generation_errors`: 生成错误数 -- `param_sync_requests`: 参数同步请求数 - -#### 新鲜度指标 -- `avg_sample_age`: 样本平均年龄 -- `max_sample_age`: 样本最大年龄 -- `stale_samples_ratio`: 过期样本比例 - -#### 队列指标 -- `queue_size`: 当前队列大小 -- `total_produced`: 总生产样本数 -- `total_consumed`: 总消费样本数 -- `dropped_samples`: 总丢弃样本数 - -## 🔍 **故障排查** - -### 常见问题 - -1. **样本生成过慢** - - 检查 `generation_timeout` 设置 - - 监控 `generation_errors` 指标 - - 调整 `batch_generation_interval` - -2. 
**样本过期严重** - - 调整 `staleness_threshold` - - 检查参数同步频率 - - 监控 `stale_samples_ratio` - -3. **队列溢出** - - 增加 `max_queue_size` - - 优化训练速度 - - 调整 `min_batch_count` - -4. **参数同步失败** - - 检查 `sync_timeout` 设置 - - 监控 `sync_failures` 指标 - - 调整 `max_sync_retries` - -### 日志分析 - -```bash -# 查看主要错误 -grep "ERROR" fully_async_training.log - -# 查看组件统计 -grep "Component Statistics" fully_async_training.log - -# 查看参数同步状态 -grep "Parameter sync" fully_async_training.log -``` - -## 🚀 **性能优化建议** - -### 1. 资源配置优化 -- 根据模型大小合理配置GPU数量 -- 训练和rollout使用独立的资源池 -- 考虑内存和计算的平衡 - -### 2. 新鲜度控制优化 -- 根据模型收敛速度调整新鲜度阈值 -- 监控样本年龄分布,避免过度丢弃 -- 动态调整队列大小 - -### 3. 参数同步优化 -- 合理设置同步频率,平衡性能和一致性 -- 使用异步同步减少等待时间 -- 监控同步耗时,及时发现问题 - -## 🔧 **扩展和定制** - -### 自定义组件 - -```python -# 自定义Trainer -class CustomFullyAsyncTrainer(FullyAsyncTrainer): - def _compute_custom_metrics(self, batch): - # 添加自定义指标计算 - pass - -# 自定义Rollouter -class CustomRollouter(Rollouter): - def _custom_generation_logic(self, batch): - # 添加自定义生成逻辑 - pass -``` - -### 自定义监控 - -```python -# 添加自定义监控指标 -def custom_monitor(trainer_stats, rollouter_stats): - # 实现自定义监控逻辑 - custom_metric = calculate_custom_metric(trainer_stats) - logger.info(f"Custom metric: {custom_metric}") -``` - -## 📚 **与OneStepOffRayTrainer的对比** - -| 特性 | OneStepOffRayTrainer | FullyAsyncTrainer | -|------|---------------------|------------------| -| 训练模式 | 同步批处理 | 异步流处理 | -| 参数更新 | 批次同步更新 | 实时异步更新 | -| 资源利用 | 阶段性利用 | 持续高效利用 | -| 新鲜度控制 | 无需考虑 | 智能控制 | -| 复杂度 | 相对简单 | 更复杂但更灵活 | -| 适用场景 | 标准训练 | 大规模持续训练 | - -## 📖 **最佳实践** - -1. **配置调优**: 从默认配置开始,根据监控指标逐步优化 -2. **资源规划**: 合理分配训练和生成资源,避免瓶颈 -3. **监控预警**: 设置关键指标的阈值报警 -4. **定期检查**: 定期检查日志和性能指标 -5. **版本管理**: 记录配置变更和性能影响 - -## 🤝 **贡献和反馈** - -欢迎提交issue和PR来改进这个异步训练系统! 
- -## 📄 **更新日志** - -### v2.0 (改进版本) -- ✅ 基于OneStepOffRayTrainer重构训练逻辑 -- ✅ 简化MessageQueue实现,去除ZeroMQ依赖 -- ✅ 改进参数同步机制,支持错误重试 -- ✅ 完善组件协调和监控系统 -- ✅ 优化错误处理和资源管理 -- ✅ 增加详细的性能指标和日志 - -### v1.0 (原始版本) -- 基础异步训练框架 -- 简单的消息队列实现 -- 基本的参数同步功能 - - -```python -DataProtoItem( - batch=TensorDict( - fields={ - attention_mask: Tensor(shape=torch.Size([3072]), device=cpu, dtype=torch.int64, is_shared=False), - input_ids: Tensor(shape=torch.Size([3072]), device=cpu, dtype=torch.int64, is_shared=False), - position_ids: Tensor(shape=torch.Size([3072]), device=cpu, dtype=torch.int64, is_shared=False), - prompts: Tensor(shape=torch.Size([1024]), device=cpu, dtype=torch.int64, is_shared=False), - response_mask: Tensor(shape=torch.Size([2048]), device=cpu, dtype=torch.int64, is_shared=False), - responses: Tensor(shape=torch.Size([2048]), device=cpu, dtype=torch.int64, is_shared=False)}, - batch_size=torch.Size([]), - device=None, - is_shared=False), - non_tensor_batch={'data_source': 'openai/gsm8k', - 'ability': 'math', - 'reward_model': {'ground_truth': '35', 'style': 'rule'}, - 'extra_info': { - 'answer': 'The total number of green and red plates is 28 + 21 = <<28+21=49>>49.\nXavier should buy 84 − 49 = 35 more plates.\n#### 35', - 'index': 1421, - 'question': 'Xavier needs 84 paper plates for a housewarming party. He already has 21 green plates and 28 red plates. 
How many more plates should Xavier buy?', 'split': 'train'}, - 'uid': 'fab3e910-67b3-4653-bc69-377250049267', - 'tools_kwargs': {}, - 'interaction_kwargs': {}, - 'index': 1421}, - meta_info={'global_token_num': [2141, 2141, 2161, 2151, 2151, 2130, 2141, 2161, 2161, 2151, 2130, 2130]}) -``` - diff --git a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml index f9aa06cd4b6..a5f58fadc2f 100644 --- a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml +++ b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml @@ -24,6 +24,8 @@ rollout: mode: async # rollout模式: sync, async name: vllm # rollout引擎: vllm, sglang n: 4 # 每个prompt生成的响应数量 + total_rollout_steps: 100 + total_epochs: 10 data: gen_batch_size: 32 diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py index cf5c0e29d5c..31541982dd7 100644 --- a/recipe/fully_async_policy/fully_async_main.py +++ b/recipe/fully_async_policy/fully_async_main.py @@ -153,27 +153,11 @@ def __init__(self): def run(self, config): """运行完全异步的PPO训练""" print("Starting fully async PPO training...") - # 设置信号处理 - self._setup_signal_handlers() # 初始化基础组件 self._initialize_components(config) - # time.sleep(60) # 启动训练流程 self._run_training_loop() - # self._cleanup_resources() - - def _setup_signal_handlers(self): - """设置信号处理器""" - - def signal_handler(signum, frame): - print(f"Received signal {signum}, initiating shutdown...") - self.running = False - self.shutdown_event.set() - - signal.signal(signal.SIGINT, signal_handler) - signal.signal(signal.SIGTERM, signal_handler) - def _initialize_components(self, config) -> None: """ 初始化所有组件 @@ -225,10 +209,10 @@ def _initialize_components(self, config) -> None: # 创建MessageQueue self.max_queue_size = ( - config.async_training.staleness_threshold - * config.data.train_batch_size - * config.actor_rollout_ref.rollout.n - ) + config.async_training.staleness_threshold + * 
config.data.train_batch_size + * config.actor_rollout_ref.rollout.n + ) * 10 # x 10 避免死锁 print("Creating MessageQueue...") message_queue = MessageQueue.remote(config, self.max_queue_size) message_queue_client = MessageQueueClient(message_queue) @@ -237,7 +221,7 @@ def _initialize_components(self, config) -> None: self.components["message_queue_client"] = message_queue_client # 创建Rollouter - print("Creating Rollouter...") + print("Creating FullyAsyncRollouter...") self._create_rollouter(config) # 创建Trainer @@ -252,16 +236,17 @@ def _initialize_components(self, config) -> None: config=config, trainer=self.components["trainer"], rollouter=self.components["rollouter"], + mq=self.components["message_queue_client"], ) # 将参数同步器设置到trainer和rollouter ray.get(self.components["trainer"].set_parameter_synchronizer.remote(param_synchronizer)) ray.get(self.components["rollouter"].set_parameter_synchronizer.remote(param_synchronizer)) + # 首先同步一次参数 ray.get(param_synchronizer.sync_weights.remote(0)) self.components["param_synchronizer"] = param_synchronizer - print("Parameter synchronizer initialized successfully") print("All components initialized successfully") def _create_rollouter(self, config) -> None: @@ -277,21 +262,14 @@ def _create_rollouter(self, config) -> None: device_name=config.trainer.device, max_queue_size=self.max_queue_size, ) - print(rollouter) - - print("========== rollouter init workers ======") - # 初始化Rollouter ray.get(rollouter.init_workers.remote()) - ray.get(rollouter.set_message_queue_client.remote(self.components["message_queue_client"])) - self.components["rollouter"] = rollouter print("Rollouter created and initialized successfully") def _create_trainer(self, config) -> None: """创建Trainer""" - # 创建trainer角色映射(排除Rollout) trainer_role_mapping = { role: worker_cls for role, worker_cls in self.components["role_worker_mapping"].items() @@ -324,74 +302,13 @@ def _run_training_loop(self): rollouter_future = self.components["rollouter"].fit.remote() 
trainer_future = self.components["trainer"].fit.remote() - print("Starting Trainer...") - time.sleep(10) - print("Starting Trainer...") - ray.get(rollouter_future) ray.get(trainer_future) + self.components["message_queue_client"].clear_queue() print("Training completed or interrupted") - def _cleanup_resources(self): - """清理所有资源""" - try: - # 关闭线程池 - if hasattr(self, 'thread_executor') and self.thread_executor: - print("Shutting down thread executor...") - self.thread_executor.shutdown(wait=True, timeout=10.0) - - # 清理logger - if hasattr(self, 'logger') and self.logger: - try: - if hasattr(self.logger, 'close'): - self.logger.close() - elif hasattr(self.logger, 'finish'): - self.logger.finish() - except Exception as e: - print(f"Error closing logger: {e}") - - # 清理validation logger - if hasattr(self, 'validation_generations_logger') and self.validation_generations_logger: - try: - if hasattr(self.validation_generations_logger, 'close'): - self.validation_generations_logger.close() - except Exception as e: - print(f"Error closing validation logger: {e}") - - # 清理异步rollout管理器 - if hasattr(self, "async_rollout_manager") and self.async_rollout_manager: - try: - if hasattr(self.async_rollout_manager, 'shutdown'): - self.async_rollout_manager.shutdown() - except Exception as e: - print(f"Error cleaning up async rollout manager: {e}") - - # 清理worker groups - if hasattr(self, 'rollout_wg') and self.rollout_wg: - try: - if hasattr(self.rollout_wg, 'shutdown'): - self.rollout_wg.shutdown() - except Exception as e: - print(f"Error cleaning up rollout worker group: {e}") - - # 强制垃圾回收 - import gc - gc.collect() - - except Exception as e: - print(f"Error during resource cleanup: {e}") - - def __del__(self): - """析构函数 - 确保资源清理""" - try: - if hasattr(self, 'running') and self.running: - print("Warning: FullyAsyncRollouter being deleted while still running") - self.shutdown() - except Exception as e: - print(f"Error in destructor: {e}") - @hydra.main(config_path="config", 
config_name="fully_async_ppo_trainer", version_base=None) def main(config): diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 1ca9c7b0d2e..d392e4a1630 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -34,17 +34,17 @@ class FullyAsyncRollouter(RayPPOTrainer): """ def __init__( - self, - config, - tokenizer, - role_worker_mapping: dict[Role, WorkerType], - resource_pool_manager: ResourcePoolManager, - ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, - processor=None, - reward_fn=None, - val_reward_fn=None, - device_name=None, - max_queue_size=1000, + self, + config, + tokenizer, + role_worker_mapping: dict[Role, WorkerType], + resource_pool_manager: ResourcePoolManager, + ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, + processor=None, + reward_fn=None, + val_reward_fn=None, + device_name=None, + max_queue_size=1000, ): """ Initialize distributed PPO trainer with Ray backend. 
@@ -99,6 +99,14 @@ def __init__( pprint(f"Rollouter _create_dataloader...\n{train_dataset}\n{val_dataset}") self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler) + total_rollout_steps = len(self.train_dataloader) * self.config.trainer.total_epochs + + if self.config.rollout.total_rollout_steps is not None: + total_rollout_steps = self.config.rollout.total_rollout_steps + + self.total_rollout_steps = total_rollout_steps + print(f"Total rollout steps: {self.total_rollout_steps}") + # rollouter 参数配置 self.message_queue_client = None @@ -159,6 +167,13 @@ def get_rollout_wg(self): """获取 rollout worker group""" return self.rollout_wg + def update_param_version(self, version: int): + """更新当前参数版本""" + with self.lock: + old_version = self.current_param_version + self.current_param_version = version + print(f"Parameter version updated from {old_version} to {version}") + def _validate_config(self): # 验证异步训练配置 if not hasattr(self.config, "async_training"): @@ -184,18 +199,19 @@ def _create_continuous_iterator(self): """ Create a continuous data iterator across epoch """ - for epoch in range(self.config.trainer.total_epochs): + for epoch in range(self.config.rollout.total_epochs): iterator = iter(self.train_dataloader) for batch_dict in iterator: yield epoch, batch_dict def fit(self): """开始异步生成样本 - 改进的主运行逻辑""" - print("Starting Rollouter...") + print("Starting FullyAsyncRollouter...") + if self.message_queue_client is None: raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.") - # if self.param_synchronizer is None: - # raise ValueError("param_synchronizer client not set. Call set_parameter_synchronizer() first.") + if self.param_synchronizer is None: + raise ValueError("param_synchronizer client not set. 
Call set_parameter_synchronizer() first.") # 设置运行状态 with self.lock: @@ -279,8 +295,11 @@ def _generation_loop(self): metrics = {} timing_raw = {} - batch, gen_batch = self._prepare_generate_batch(batch_dict) - is_last_step = self.global_steps >= self.total_training_steps + + with self.lock: + batch, gen_batch = self._prepare_generate_batch(batch_dict) + + is_last_step = self.global_steps >= self.total_rollout_steps # generate a batch with marked_timer("gen", timing_raw, color="red"): @@ -334,6 +353,12 @@ def _generation_loop(self): with self.lock: self.running = False + # 发送终止信号 + self.message_queue_client.put_sample( + sample=None, + param_version=self.current_param_version, + ) + def _monitor_loop(self): """监控线程 - 监控状态并处理控制信号""" # 主线程保持运行,处理控制信号和状态监控 @@ -390,7 +415,10 @@ def _should_pause_generation(self) -> bool: return True # 出错时暂停生成 def pause(self) -> bool: - """暂停生成 - 供外部调用""" + """暂停生成 + TODO 集成 Partial Rollout + """ + print("[rollouter] pause") with self.lock: if not self.running: return False @@ -402,7 +430,10 @@ def pause(self) -> bool: return True def resume(self) -> bool: - """恢复生成 - 供外部调用""" + """恢复生成 + TODO 集成 Partial Rollout + """ + print("[rollouter] resume") with self.lock: if not self.running: return False @@ -415,45 +446,6 @@ def resume(self) -> bool: print("Generation resumed") return True - def shutdown(self): - """关闭Rollouter - 改进的关闭逻辑""" - print("Shutting down Rollouter...") - - with self.lock: - self.running = False - self.paused = False - self.condition.notify_all() - - # 等待生成线程结束 - if self.generation_thread and self.generation_thread.is_alive(): - print("Waiting for generation thread to finish...") - self.generation_thread.join(timeout=10.0) - - if self.generation_thread.is_alive(): - print("Generation thread did not finish within timeout") - - # 等待监控线程结束 - if self.monitor_thread and self.monitor_thread.is_alive(): - print("Waiting for monitor thread to finish...") - self.monitor_thread.join(timeout=5.0) - - if 
self.monitor_thread.is_alive(): - print("Monitor thread did not finish within timeout") - - # 关闭线程池 - if self.thread_executor: - self.thread_executor.shutdown(wait=True) - - # 清理异步rollout管理器 - if hasattr(self, "async_rollout_manager"): - try: - # TODO: 添加异步rollout管理器的清理逻辑 - pass - except Exception as e: - print(f"Error cleaning up async rollout manager: {e}") - - print("Rollouter shutdown complete") - def get_statistics(self) -> dict: with self.lock: queue_stats = self.message_queue_client.get_statistics() @@ -468,102 +460,3 @@ def get_statistics(self) -> dict: "queue_size": f"{queue_stats['queue_size']}", } return stats - - def update_rollout_weights(self, param_version: int) -> bool: - """ - 更新rollout模型参数 - 改进的参数同步实现 - 这个方法由外部Trainer调用 - - Args: - param_version: 新的参数版本号 - - Returns: - bool: 是否成功更新参数 - """ - print(f"Updating rollout weights to version {param_version}") - - with self.sync_lock: - if self.sync_in_progress: - print(f"Sync already in progress, skipping version {param_version}") - return False - - self.sync_in_progress = True - - try: - # 暂停rollout - 带超时机制 - if not self.rollout_controller.pause(timeout=10.0): - print("Failed to pause rollout within timeout") - return False - - # 等待当前generation完成(如果有的话) - time.sleep(0.1) - - # 执行参数同步 - sync_success = self._execute_parameter_sync(param_version) - - if sync_success: - self.current_param_version = param_version - self.param_sync_requests += 1 - self.last_sync_time = time.time() - print(f"Successfully updated rollout weights to version {param_version}") - else: - print(f"Failed to sync parameters to version {param_version}") - - except Exception as e: - print(f"Error during parameter sync: {e}") - sync_success = False - finally: - # 恢复rollout - self.rollout_controller.resume() - self.sync_in_progress = False - - return sync_success - - def _execute_parameter_sync(self, param_version: int) -> bool: - """ - 执行实际的参数同步 - 改进的同步逻辑 - - Args: - param_version: 目标参数版本 - - Returns: - bool: 是否同步成功 - """ - try: - # 
暂停推理引擎 - if self.async_rollout_mode and hasattr(self, "async_rollout_manager"): - # 对于异步模式,暂停服务器 - pass # 异步服务器的暂停在 pause() 中已经处理 - else: - # 对于同步模式,使用sleep/wake_up机制 - sleep_futures = self.rollout_wg.sleep() - ray.get(sleep_futures) - - # 执行参数同步 - if self.param_synchronizer: - self.param_synchronizer.sync_weights() - print("Parameter synchronization completed via synchronizer") - else: - # 直接使用rollout worker group的同步机制 - if hasattr(self.rollout_wg, "sync_rollout_weights"): - sync_futures = self.rollout_wg.sync_rollout_weights() - ray.get(sync_futures) - print("Parameter synchronization completed via rollout worker group") - else: - print("No parameter synchronization mechanism available") - return False - - # 恢复推理引擎 - if self.async_rollout_mode and hasattr(self, "async_rollout_manager"): - # 对于异步模式,恢复服务器 - pass # 异步服务器的恢复在 resume() 中已经处理 - else: - # 对于同步模式,唤醒workers - wake_futures = self.rollout_wg.wake_up() - ray.get(wake_futures) - - return True - - except Exception as e: - print(f"Parameter sync execution failed: {e}") - return False diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index afef0968a04..29a7a5c830b 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -105,7 +105,6 @@ def __init__( self.processed_samples = 0 self.stale_samples_processed = 0 self.current_param_version = 0 - self.param_sync_count = 0 # 参数同步相关状态 self._weights_info = None @@ -133,8 +132,6 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]: Returns: tuple: (epoch, batch_dict, gen_batch_output) """ - if self.message_queue_client is None: - raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.") # 计算需要获取的样本数量 n_responses_per_prompt = self.config.actor_rollout_ref.rollout.n @@ -268,11 +265,11 @@ def fit(self): to construct the PPO dataflow. 
The light-weight advantage computation is done on the driver process. """ - - print("FullyAsyncTrainer run") - + print("Starting FullyAsyncTrainer...") if self.message_queue_client is None: raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.") + if self.param_synchronizer is None: + raise ValueError("param_synchronizer client not set. Call set_parameter_synchronizer() first.") from verl.utils.tracking import Tracking @@ -288,22 +285,9 @@ def fit(self): # load checkpoint before doing anything self._load_checkpoint() - # perform validation before training - # currently, we only support validation using the reward_function. - if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True): - val_metrics = self._validate() - assert val_metrics, f"{val_metrics=}" - pprint(f"Initial validation metrics: {val_metrics}") - print(data=val_metrics, step=self.global_steps) - if self.config.trainer.get("val_only", False): - return - # TODO 需要从 self.total_training_steps = self.config.trainer.total_training_steps print(f"Total training steps: {self.total_training_steps}") - # add tqdm - # progress_bar = tqdm(total=self.total_training_steps, initial=self.global_steps, desc="Training Progress") - # we start from step 1 self.global_steps += 1 last_val_metrics = None @@ -325,13 +309,6 @@ def fit(self): metrics = {} timing_raw = {} - do_profile = ( - self.global_steps in self.config.trainer.profile_steps - if self.config.trainer.profile_steps is not None - else False - ) - self._start_profiling(do_profile, timing_raw) - is_last_step = self.global_steps >= self.total_training_steps with marked_timer("step", timing_raw): @@ -384,13 +361,12 @@ def fit(self): # self._stop_profiling(do_profile, timing_raw) print("_collect_metrics") # self._collect_metrics(batch, epoch, metrics, timing_raw) - print("_post_batch_processing") - # self._post_batch_processing(batch) - print("step end") # 在训练步骤结束后触发参数同步 + 
print("_trigger_parameter_sync_after_step") + self._trigger_parameter_sync_after_step() - # progress_bar.update(1) + print("global_steps") self.global_steps += 1 print(f"is_last_step {is_last_step}") if is_last_step: @@ -405,7 +381,6 @@ def get_statistics(self) -> dict: "processed_samples": self.processed_samples, "stale_samples_processed": self.stale_samples_processed, "current_param_version": self.current_param_version, - "param_sync_count": self.param_sync_count, "queue_size": queue_stats.get("queue_size", 0), "queue_total_produced": queue_stats.get("total_produced", 0), "queue_total_consumed": queue_stats.get("total_consumed", 0), @@ -417,12 +392,12 @@ def _trigger_parameter_sync_after_step(self): 在训练步骤结束后触发参数同步 这确保rollouter总是使用最新训练的参数 """ - new_version = self.current_param_version + 1 + self.current_param_version = self.current_param_version + 1 print( - f"[TRAINER] Triggering parameter sync after training step {self.global_steps}, version: {new_version}" + f"[TRAINER] Triggering parameter sync after training step {self.global_steps}, version: {self.current_param_version}" ) - logger.info(f"Triggering parameter sync after training step {self.global_steps}, version: {new_version}") - ray.get(self.param_synchronizer.sync_weights.remote(new_version)) + logger.info(f"Triggering parameter sync after training step {self.global_steps}, version: {self.current_param_version}") + ray.get(self.param_synchronizer.sync_weights.remote(self.current_param_version)) def _compute_sample_freshness_metrics(self, batch_samples: list[QueueSample]) -> dict: """ diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py index e5c382dec2a..47dbd34ecff 100644 --- a/recipe/fully_async_policy/message_queue.py +++ b/recipe/fully_async_policy/message_queue.py @@ -131,7 +131,11 @@ def get_samples(self, min_batch_count: int = 1) -> list[Any]: samples = [] for _ in range(batch_count): if self.queue: - samples.append(self.queue.popleft()) + data = 
self.queue.popleft() + if data is None: + return [] + else: + samples.append(self.queue.popleft()) self.total_consumed += len(samples) return samples diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py index 3657916dda0..cb9baa5ff8a 100644 --- a/recipe/fully_async_policy/param_sync.py +++ b/recipe/fully_async_policy/param_sync.py @@ -28,11 +28,11 @@ class ParameterSynchronizer: 合并了原有的多个同步器类的功能 """ - def __init__(self, config, trainer, rollouter): - + def __init__(self, config, trainer, rollouter, mq): self.config = config self.trainer = trainer self.rollouter = rollouter + self.mq_client = mq self.actor_wg = ray.get(trainer.get_actor_wg.remote()) self.rollout_wg = ray.get(rollouter.get_rollout_wg.remote()) @@ -72,10 +72,18 @@ def _init_sync_group(self): def sync_weights(self, version): self.current_version = version - logger.debug(f"Starting weight synchronization (version {self.current_version})...") + print(f"Starting weight synchronization (version {self.current_version})...") + + print("pause rollout") + ray.get(self.rollouter.pause.remote()) + + # 更新MQ 版本 + self.mq_client.update_param_version(version) - # TODO 暂停及恢复rollout - print("TODO 暂停及恢复rollout") self.actor_wg.sync_rollout_weights() ray.get(self.rollout_wg.sync_rollout_weights()) + + # 更新 rollout 版本 + ray.get(self.rollouter.update_param_version.remote(version)) + ray.get(self.rollouter.resume.remote()) print("sync_weights success") diff --git a/recipe/fully_async_policy/unittest/test_components_pytest.py b/recipe/fully_async_policy/unittest/test_components_pytest.py deleted file mode 100644 index fd2e207cbe4..00000000000 --- a/recipe/fully_async_policy/unittest/test_components_pytest.py +++ /dev/null @@ -1,315 +0,0 @@ -# Copyright 2025 Meituan Ltd. and/or its affiliates -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Pytest测试文件,用于测试完全异步PPO训练系统的各个组件 -""" - -import time -from unittest.mock import Mock - -import pytest -import ray -from omegaconf import OmegaConf - - -@pytest.fixture -def ray_setup(): - """Ray初始化fixture""" - if not ray.is_initialized(): - ray.init(ignore_reinit_error=True, num_cpus=2) - yield - # 测试后不关闭Ray,因为其他测试可能还需要 - - -@pytest.fixture -def basic_config(): - """基本配置fixture""" - return OmegaConf.create( - { - "actor_rollout_ref": {"hybrid_engine": False, "model": {"lora_rank": 0}, "rollout": {"n": 2}}, - "algorithm": {"use_kl_in_reward": False}, - "critic": {"enable": False}, - "trainer": { - "device": "cpu", - "project_name": "test", - "experiment_name": "test", - "total_epochs": 1, - "total_training_steps": 2, - }, - "async_training": { - "staleness_threshold": 3, - "max_staleness_allowed": 5, - "generation_timeout": 10.0, - "batch_timeout": 5.0, - }, - "data": {"train_batch_size": 4}, - } - ) - - -class TestMessageQueue: - """测试MessageQueue功能""" - - def test_message_queue_creation(self, ray_setup): - """测试MessageQueue创建""" - try: - from message_queue import MessageQueueClient - - queue = MessageQueueClient.remote(max_queue_size=10, max_staleness=3) - - # 测试基本功能 - stats = ray.get(queue.get_statistics.remote()) - assert "queue_size" in stats - assert stats["queue_size"] == 0 - - ray.kill(queue) - - except ImportError: - pytest.skip("MessageQueue not available") - - def test_queue_put_get(self, ray_setup): - """测试队列的put/get操作""" - try: - from message_queue import MessageQueueClient - - queue = MessageQueueClient.remote(max_queue_size=10, 
max_staleness=3) - - # 创建模拟样本 - mock_sample = Mock() - mock_sample.batch_size = 4 - - # 测试放入样本 - success = ray.get( - queue.put_sample.remote( - epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()} - ) - ) - assert success - - # 测试获取样本 - result = ray.get(queue.get_samples.remote(min_batch_count=1, timeout=2.0, current_param_version=1)) - assert result is not None - - ray.kill(queue) - - except ImportError: - pytest.skip("MessageQueue not available") - - -class TestRollouter: - """测试Rollouter功能""" - - def test_rollouter_pause_resume(self, ray_setup, basic_config): - """测试Rollouter的暂停恢复功能""" - try: - from fully_async_rollouter import FullyAsyncRollouter - - # 创建模拟依赖 - mock_tokenizer = Mock() - mock_role_worker_mapping = {} - mock_resource_pool_manager = Mock() - - # 创建Rollouter - rollouter = FullyAsyncRollouter.remote( - config=basic_config, - tokenizer=mock_tokenizer, - role_worker_mapping=mock_role_worker_mapping, - resource_pool_manager=mock_resource_pool_manager, - ) - - # 测试暂停 - result = ray.get(rollouter.pause_rollout.remote()) - assert result is True - - # 检查状态 - is_paused = ray.get(rollouter.is_rollout_paused.remote()) - assert is_paused is True - - # 测试恢复 - result = ray.get(rollouter.resume_rollout.remote()) - assert result is True - - # 检查状态 - is_paused = ray.get(rollouter.is_rollout_paused.remote()) - assert is_paused is False - - ray.kill(rollouter) - - except ImportError: - pytest.skip("FullyAsyncRollouter not available") - - def test_rollouter_statistics(self, ray_setup, basic_config): - """测试Rollouter统计功能""" - try: - from fully_async_rollouter import FullyAsyncRollouter - - mock_tokenizer = Mock() - mock_role_worker_mapping = {} - mock_resource_pool_manager = Mock() - - rollouter = FullyAsyncRollouter.remote( - config=basic_config, - tokenizer=mock_tokenizer, - role_worker_mapping=mock_role_worker_mapping, - resource_pool_manager=mock_resource_pool_manager, - ) - - # 获取统计信息 - stats = 
ray.get(rollouter.get_statistics.remote()) - - # 验证必要字段存在 - required_fields = [ - "total_generated_samples", - "dropped_stale_samples", - "generation_errors", - "current_param_version", - "is_paused", - "pause_count", - ] - - for field in required_fields: - assert field in stats - - ray.kill(rollouter) - - except ImportError: - pytest.skip("FullyAsyncRollouter not available") - - -class TestTrainer: - """测试Trainer功能""" - - def test_trainer_creation(self, ray_setup, basic_config): - """测试Trainer创建""" - try: - from fully_async_trainer import FullyAsyncTrainer - - mock_tokenizer = Mock() - mock_role_worker_mapping = {} - mock_resource_pool_manager = Mock() - - trainer = FullyAsyncTrainer.remote( - config=basic_config, - tokenizer=mock_tokenizer, - role_worker_mapping=mock_role_worker_mapping, - resource_pool_manager=mock_resource_pool_manager, - ) - - # 基本验证 - assert trainer is not None - - ray.kill(trainer) - - except ImportError: - pytest.skip("FullyAsyncTrainer not available") - - -class TestParameterSync: - """测试参数同步功能""" - - def test_param_sync_creation(self, ray_setup): - """测试参数同步器创建""" - try: - from param_sync import ParameterSynchronizer - - config = OmegaConf.create( - {"async_training": {"max_sync_retries": 3, "sync_timeout": 10.0, "sync_retry_delay": 0.1}} - ) - - mock_actor_wg = Mock() - mock_rollout_wg = Mock() - - synchronizer = ParameterSynchronizer.remote( - config=config, actor_wg=mock_actor_wg, rollout_wg=mock_rollout_wg - ) - - assert synchronizer is not None - - ray.kill(synchronizer) - - except ImportError: - pytest.skip("ParameterSynchronizer not available") - - -class TestIntegration: - """集成测试""" - - def test_basic_workflow_simulation(self, ray_setup): - """测试基本工作流模拟""" - # 这是一个简化的集成测试,模拟基本的工作流 - try: - from message_queue import MessageQueueClient - - # 创建消息队列 - queue = MessageQueueClient.remote(max_queue_size=5, max_staleness=2) - - # 模拟生产者(Rollouter) - mock_sample = Mock() - mock_sample.batch_size = 2 - - # 放入样本 - success = ray.get( - 
queue.put_sample.remote( - epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()} - ) - ) - assert success - - # 模拟消费者(Trainer) - result = ray.get(queue.get_samples.remote(min_batch_count=1, timeout=2.0, current_param_version=1)) - assert result is not None - - samples, metadata_list = result - assert len(samples) == 1 - assert len(metadata_list) == 1 - - ray.kill(queue) - - except ImportError: - pytest.skip("Integration test components not available") - - -class TestErrorHandling: - """错误处理测试""" - - def test_timeout_handling(self, ray_setup): - """测试超时处理""" - try: - from message_queue import MessageQueueClient - - queue = MessageQueueClient.remote(max_queue_size=5, max_staleness=2) - - # 测试从空队列超时获取 - start_time = time.time() - result = ray.get( - queue.get_samples.remote( - min_batch_count=1, - timeout=1.0, # 1秒超时 - current_param_version=1, - ) - ) - elapsed = time.time() - start_time - - assert result is None - assert 0.9 <= elapsed <= 2.0 # 允许一些误差 - - ray.kill(queue) - - except ImportError: - pytest.skip("MessageQueue not available") - - -if __name__ == "__main__": - # 如果直接运行此文件,执行所有测试 - pytest.main([__file__, "-v"]) diff --git a/recipe/fully_async_policy/unittest/test_fully_async.py b/recipe/fully_async_policy/unittest/test_fully_async.py deleted file mode 100644 index 126ff489bf2..00000000000 --- a/recipe/fully_async_policy/unittest/test_fully_async.py +++ /dev/null @@ -1,194 +0,0 @@ -# Copyright 2025 Meituan Ltd. and/or its affiliates -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -""" -测试完全异步训练工作流的组件 -""" - -import logging -import unittest -from unittest.mock import Mock - -import ray -from omegaconf import OmegaConf - -from recipe.fully_async_policy.message_queue import DataProto, MessageQueue, MessageQueueClient - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - - -class TestMessageQueue(unittest.TestCase): - """测试MessageQueue组件""" - - def setUp(self): - """设置测试环境""" - if not ray.is_initialized(): - ray.init(local_mode=True) - - config = OmegaConf.create( - { - "async_training": { - "staleness_threshold": 3, - "max_staleness_allowed": 5, - } - } - ) - - self.message_queue = MessageQueue.remote(config, max_queue_size=100) - self.client = MessageQueueClient(self.message_queue) - - def tearDown(self): - """清理测试环境""" - ray.get(self.message_queue.shutdown.remote()) - if ray.is_initialized(): - ray.shutdown() - - def test_basic_put_get(self): - """测试基本的put和get操作""" - # 创建mock数据 - mock_batch = Mock(spec=DataProto) - - # 放入样本 - success = self.client.put_sample(sample=mock_batch, param_version=1, rollout_metadata={"test": "data"}) - self.assertTrue(success) - - # 获取样本 - samples = self.client.get_samples(min_batch_count=1, timeout=5.0) - self.assertIsNotNone(samples) - self.assertEqual(len(samples), 1) - self.assertEqual(samples[0].param_version, 1) - - def test_freshness_control(self): - """测试新鲜度控制""" - mock_batch = Mock(spec=DataProto) - - # 更新参数版本 - self.client.update_param_version(10) - - # 尝试放入过期样本 - success = self.client.put_sample( - sample=mock_batch, - param_version=5, # 版本差异为5,超过阈值3 - rollout_metadata={}, - ) - self.assertFalse(success) # 应该被拒绝 - - def test_queue_statistics(self): - """测试队列统计信息""" - stats = self.client.get_statistics() - self.assertIn("queue_size", stats) - self.assertIn("total_produced", stats) - self.assertIn("total_consumed", stats) - self.assertIn("dropped_samples", stats) - - 
-class TestRollouterComponents(unittest.TestCase): - """测试Rollouter相关组件""" - - def setUp(self): - """设置测试环境""" - from .fully_async_rollouter import RolloutController - - self.controller = RolloutController() - - def test_rollout_controller(self): - """测试rollout控制器""" - # 初始状态应该是运行的 - self.assertFalse(self.controller.is_paused) - - # 测试暂停 - self.controller.pause() - self.assertTrue(self.controller.is_paused) - - # 测试恢复 - self.controller.resume() - self.assertFalse(self.controller.is_paused) - - -class TestParameterSync(unittest.TestCase): - """测试参数同步组件""" - - def test_async_parameter_synchronizer(self): - """测试异步参数同步器""" - from recipe.fully_async_policy.param_sync import AsyncParameterSynchronizer - - config = OmegaConf.create({}) - mock_actor_wg = Mock() - mock_rollouter_actor = Mock() - - sync = AsyncParameterSynchronizer(config, mock_actor_wg, mock_rollouter_actor) - - self.assertEqual(sync.get_current_version(), 0) - - -def test_integration(): - """集成测试""" - logger.info("Starting integration test...") - - if not ray.is_initialized(): - ray.init(local_mode=True) - - try: - # 测试MessageQueue和客户端的集成 - config = OmegaConf.create( - { - "async_training": { - "staleness_threshold": 3, - "max_staleness_allowed": 5, - } - } - ) - - message_queue = MessageQueue.remote(config, max_queue_size=10) - client = MessageQueueClient(message_queue) - - # 模拟生产者-消费者场景 - mock_batch = Mock(spec=DataProto) - - # 生产样本 - for i in range(5): - success = client.put_sample(sample=mock_batch, param_version=i, rollout_metadata={"batch_id": i}) - assert success, f"Failed to put batch {i}" - - # 消费样本 - samples = client.get_samples(min_batch_count=3, timeout=10.0) - assert samples is not None, "Failed to get samples" - assert len(samples) == 3, f"Expected 3 samples, got {len(samples)}" - - # 检查统计信息 - stats = client.get_statistics() - assert stats["total_produced"] == 5 - assert stats["total_consumed"] == 3 - - logger.info("Integration test passed!") - - # 清理 - 
ray.get(message_queue.shutdown.remote()) - - finally: - if ray.is_initialized(): - ray.shutdown() - - -if __name__ == "__main__": - # 运行单元测试 - unittest.main(argv=[""], exit=False, verbosity=2) - - # 运行集成测试 - test_integration() - - print("\n" + "=" * 50) - print("所有测试完成!") - print("=" * 50) diff --git a/recipe/fully_async_policy/unittest/test_fully_async_components.py b/recipe/fully_async_policy/unittest/test_fully_async_components.py deleted file mode 100644 index 8a5bc85d562..00000000000 --- a/recipe/fully_async_policy/unittest/test_fully_async_components.py +++ /dev/null @@ -1,444 +0,0 @@ -# Copyright 2025 Meituan Ltd. and/or its affiliates -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" -单元测试文件,用于测试完全异步PPO训练系统的各个组件 -""" - -import os - -# Import components to test -import sys -import time -import unittest -from unittest.mock import Mock - -import ray -from omegaconf import OmegaConf - -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -from fully_async_rollouter import FullyAsyncRollouter -from fully_async_trainer import FullyAsyncTrainer -from message_queue import MessageQueueClient -from param_sync import ParameterSynchronizer - - -class TestMessageQueue(unittest.TestCase): - """测试MessageQueue的功能""" - - def setUp(self): - """设置测试环境""" - if not ray.is_initialized(): - ray.init(ignore_reinit_error=True) - - # 创建MessageQueue客户端 - self.message_queue = MessageQueueClient.remote(max_queue_size=100, max_staleness=3) - - def tearDown(self): - """清理测试环境""" - if hasattr(self, "message_queue"): - ray.kill(self.message_queue) - - def test_put_and_get_samples(self): - """测试放入和获取样本的基本功能""" - # 创建模拟样本数据 - mock_sample = Mock() - mock_sample.batch_size = 4 - - # 测试放入样本 - success = ray.get( - self.message_queue.put_sample.remote( - epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()} - ) - ) - self.assertTrue(success) - - # 测试获取样本 - result = ray.get(self.message_queue.get_samples.remote(min_batch_count=1, timeout=5.0, current_param_version=1)) - - self.assertIsNotNone(result) - samples, metadata_list = result - self.assertEqual(len(samples), 1) - self.assertEqual(len(metadata_list), 1) - - def test_staleness_control(self): - """测试新鲜度控制功能""" - mock_sample = Mock() - mock_sample.batch_size = 4 - - # 放入一个参数版本较老的样本 - success = ray.get( - self.message_queue.put_sample.remote( - epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()} - ) - ) - self.assertTrue(success) - - # 尝试用较新的参数版本获取样本(应该被拒绝) - result = ray.get( - self.message_queue.get_samples.remote( - min_batch_count=1, - timeout=5.0, - current_param_version=5, # 版本差距为4 > max_staleness(3) - ) - ) - - # 
应该返回空结果,因为样本过期 - self.assertIsNone(result) - - def test_queue_statistics(self): - """测试队列统计功能""" - # 获取初始统计 - stats = ray.get(self.message_queue.get_statistics.remote()) - initial_queue_size = stats["queue_size"] - - # 添加一些样本 - mock_sample = Mock() - mock_sample.batch_size = 4 - - for i in range(3): - ray.get( - self.message_queue.put_sample.remote( - epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()} - ) - ) - - # 检查统计是否更新 - stats = ray.get(self.message_queue.get_statistics.remote()) - self.assertEqual(stats["queue_size"], initial_queue_size + 3) - self.assertEqual(stats["total_produced"], 3) - - -class TestParameterSynchronizer(unittest.TestCase): - """测试参数同步器的功能""" - - def setUp(self): - """设置测试环境""" - if not ray.is_initialized(): - ray.init(ignore_reinit_error=True) - - self.config = OmegaConf.create( - {"async_training": {"max_sync_retries": 3, "sync_timeout": 10.0, "sync_retry_delay": 0.1}} - ) - - def test_sync_with_retry(self): - """测试带重试机制的参数同步""" - # 创建模拟的worker groups - mock_actor_wg = Mock() - mock_rollout_wg = Mock() - - # 模拟同步操作 - mock_actor_wg.get_weights.return_value = ray.put({"param1": "value1"}) - mock_rollout_wg.set_weights.return_value = [] - - synchronizer = ParameterSynchronizer.remote( - config=self.config, actor_wg=mock_actor_wg, rollout_wg=mock_rollout_wg - ) - - # 测试成功同步 - result = ray.get(synchronizer.sync_weights.remote()) - self.assertTrue(result) - - def test_sync_failure_and_retry(self): - """测试同步失败和重试机制""" - mock_actor_wg = Mock() - mock_rollout_wg = Mock() - - # 模拟同步失败 - mock_actor_wg.get_weights.side_effect = Exception("Sync failed") - - synchronizer = ParameterSynchronizer.remote( - config=self.config, actor_wg=mock_actor_wg, rollout_wg=mock_rollout_wg - ) - - # 测试失败时的重试 - result = ray.get(synchronizer.sync_weights.remote()) - self.assertFalse(result) - - -class TestFullyAsyncRollouter(unittest.TestCase): - """测试异步Rollouter的功能""" - - def setUp(self): - """设置测试环境""" - if not 
ray.is_initialized(): - ray.init(ignore_reinit_error=True) - - def test_pause_resume_functionality(self): - """测试暂停和恢复功能""" - # 创建配置 - config = OmegaConf.create( - { - "actor_rollout_ref": {"hybrid_engine": False, "model": {"lora_rank": 0}, "rollout": {"n": 2}}, - "algorithm": {"use_kl_in_reward": False}, - "critic": {"enable": False}, - "trainer": {"device": "cpu", "project_name": "test", "experiment_name": "test"}, - "async_training": { - "staleness_threshold": 3, - "max_staleness_allowed": 5, - "generation_timeout": 10.0, - "batch_generation_interval": 0.1, - }, - } - ) - - # 创建模拟的依赖 - mock_tokenizer = Mock() - mock_role_worker_mapping = Mock() - mock_resource_pool_manager = Mock() - - # 创建Rollouter实例 - rollouter = FullyAsyncRollouter.remote( - config=config, - tokenizer=mock_tokenizer, - role_worker_mapping=mock_role_worker_mapping, - resource_pool_manager=mock_resource_pool_manager, - ) - - # 测试暂停功能 - result = ray.get(rollouter.pause_rollout.remote()) - self.assertTrue(result) - - # 检查暂停状态 - is_paused = ray.get(rollouter.is_rollout_paused.remote()) - self.assertTrue(is_paused) - - # 测试恢复功能 - result = ray.get(rollouter.resume_rollout.remote()) - self.assertTrue(result) - - # 检查恢复状态 - is_paused = ray.get(rollouter.is_rollout_paused.remote()) - self.assertFalse(is_paused) - - def test_statistics_collection(self): - """测试统计信息收集功能""" - config = OmegaConf.create( - { - "actor_rollout_ref": {"hybrid_engine": False, "model": {"lora_rank": 0}, "rollout": {"n": 2}}, - "algorithm": {"use_kl_in_reward": False}, - "critic": {"enable": False}, - "trainer": {"device": "cpu", "project_name": "test", "experiment_name": "test"}, - "async_training": {"staleness_threshold": 3, "max_staleness_allowed": 5, "generation_timeout": 10.0}, - } - ) - - mock_tokenizer = Mock() - mock_role_worker_mapping = Mock() - mock_resource_pool_manager = Mock() - - rollouter = FullyAsyncRollouter.remote( - config=config, - tokenizer=mock_tokenizer, - role_worker_mapping=mock_role_worker_mapping, - 
resource_pool_manager=mock_resource_pool_manager, - ) - - # 获取统计信息 - stats = ray.get(rollouter.get_statistics.remote()) - - # 验证统计信息包含必要的字段 - expected_keys = [ - "total_generated_samples", - "dropped_stale_samples", - "generation_errors", - "current_param_version", - "is_paused", - "pause_count", - "resume_count", - ] - - for key in expected_keys: - self.assertIn(key, stats) - - -class TestFullyAsyncTrainer(unittest.TestCase): - """测试异步Trainer的功能""" - - def setUp(self): - """设置测试环境""" - if not ray.is_initialized(): - ray.init(ignore_reinit_error=True) - - def test_freshness_metrics_calculation(self): - """测试新鲜度指标计算""" - # 创建基本配置 - config = OmegaConf.create( - { - "trainer": { - "device": "cpu", - "project_name": "test", - "experiment_name": "test", - "total_epochs": 1, - "total_training_steps": 2, - }, - "async_training": {"staleness_threshold": 3, "max_staleness_allowed": 5, "batch_timeout": 10.0}, - "data": {"train_batch_size": 4}, - "actor_rollout_ref": {"hybrid_engine": False, "model": {"lora_rank": 0}}, - "algorithm": {"use_kl_in_reward": False}, - "critic": {"enable": False}, - } - ) - - # 创建模拟的依赖 - mock_tokenizer = Mock() - mock_role_worker_mapping = Mock() - mock_resource_pool_manager = Mock() - - trainer = FullyAsyncTrainer.remote( - config=config, - tokenizer=mock_tokenizer, - role_worker_mapping=mock_role_worker_mapping, - resource_pool_manager=mock_resource_pool_manager, - ) - - # 测试新鲜度指标计算 - current_time = time.time() - metadata_list = [ - {"generation_timestamp": current_time - 5, "rollout_param_version": 1}, - {"generation_timestamp": current_time - 10, "rollout_param_version": 2}, - {"generation_timestamp": current_time - 15, "rollout_param_version": 1}, - ] - - freshness_metrics = ray.get(trainer._calculate_freshness_metrics.remote(metadata_list, current_param_version=3)) - - # 验证新鲜度指标 - self.assertIn("avg_sample_age", freshness_metrics) - self.assertIn("max_sample_age", freshness_metrics) - self.assertIn("min_sample_age", freshness_metrics) - 
self.assertIn("version_diversity", freshness_metrics) - self.assertIn("staleness_ratio", freshness_metrics) - - -class TestIntegrationScenarios(unittest.TestCase): - """测试组件集成场景""" - - def setUp(self): - """设置测试环境""" - if not ray.is_initialized(): - ray.init(ignore_reinit_error=True) - - def test_message_queue_trainer_integration(self): - """测试MessageQueue与Trainer的集成""" - # 创建MessageQueue - message_queue = MessageQueueClient.remote(max_queue_size=10, max_staleness=3) - - # 放入一些测试样本 - mock_sample = Mock() - mock_sample.batch_size = 4 - - ray.get( - message_queue.put_sample.remote( - epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()} - ) - ) - - # 验证Trainer能够获取样本 - result = ray.get(message_queue.get_samples.remote(min_batch_count=1, timeout=5.0, current_param_version=1)) - - self.assertIsNotNone(result) - samples, metadata_list = result - self.assertEqual(len(samples), 1) - - def test_rollouter_message_queue_integration(self): - """测试Rollouter与MessageQueue的集成""" - # 这个测试需要更多的模拟设置,因为涉及到实际的模型生成 - # 在实际实现中,可以使用更多的Mock对象来模拟这种集成 - pass - - -class TestErrorHandling(unittest.TestCase): - """测试错误处理和边界情况""" - - def setUp(self): - """设置测试环境""" - if not ray.is_initialized(): - ray.init(ignore_reinit_error=True) - - def test_message_queue_overflow(self): - """测试消息队列溢出处理""" - # 创建小容量的队列 - message_queue = MessageQueueClient.remote(max_queue_size=2, max_staleness=3) - - mock_sample = Mock() - mock_sample.batch_size = 4 - - # 填满队列 - for i in range(2): - result = ray.get( - message_queue.put_sample.remote( - epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()} - ) - ) - self.assertTrue(result) - - # 尝试再放入一个样本(应该失败或者覆盖旧样本) - result = ray.get( - message_queue.put_sample.remote( - epoch=1, sample=mock_sample, param_version=1, rollout_metadata={"timestamp": time.time()} - ) - ) - - # 根据实现,这里可能是False(拒绝)或True(覆盖) - self.assertIsInstance(result, bool) - - def test_timeout_handling(self): - """测试超时处理""" - 
message_queue = MessageQueueClient.remote(max_queue_size=10, max_staleness=3) - - # 尝试从空队列获取样本,应该超时 - start_time = time.time() - result = ray.get( - message_queue.get_samples.remote( - min_batch_count=1, - timeout=1.0, # 1秒超时 - current_param_version=1, - ) - ) - elapsed = time.time() - start_time - - # 应该返回None并且大约在1秒后返回 - self.assertIsNone(result) - self.assertGreater(elapsed, 0.9) # 允许一些误差 - self.assertLess(elapsed, 2.0) - - -if __name__ == "__main__": - # 设置测试套件 - test_suite = unittest.TestSuite() - - # 添加测试用例 - test_classes = [ - TestMessageQueue, - TestParameterSynchronizer, - TestFullyAsyncRollouter, - TestFullyAsyncTrainer, - TestIntegrationScenarios, - TestErrorHandling, - ] - - for test_class in test_classes: - tests = unittest.TestLoader().loadTestsFromTestCase(test_class) - test_suite.addTests(tests) - - # 运行测试 - runner = unittest.TextTestRunner(verbosity=2) - result = runner.run(test_suite) - - # 清理Ray - if ray.is_initialized(): - ray.shutdown() - - # 退出 - exit(0 if result.wasSuccessful() else 1) diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh index 50eb9070314..52a4d2bc8fd 100644 --- a/tests/special_e2e/run_fully_async_policy.sh +++ b/tests/special_e2e/run_fully_async_policy.sh @@ -121,6 +121,8 @@ common_params=( trainer.n_gpus_per_node=${n_gpus_training} rollout.nnodes=1 rollout.n_gpus_per_node=${n_gpus_rollout} + rollout.total_rollout_steps=100 + rollout.total_epochs=10 # Fully async specific configurations async_training.staleness_threshold=${staleness_threshold} async_training.sync_timeout=${sync_timeout} diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py index 9b87d5a3bd8..89acaebfe03 100644 --- a/verl/trainer/ppo/ray_trainer.py +++ b/verl/trainer/ppo/ray_trainer.py @@ -1248,7 +1248,7 @@ def _process_batch_common(self, batch, metrics, timing_raw): reward_tensor, reward_extra_infos_dict = compute_reward(batch, self.reward_fn) # recompute old_log_probs with 
marked_timer("old_log_prob", timing_raw, color="blue"): - print("marked_timer rewold_log_prob") + print("marked_timer old_log_prob") old_log_prob = self.actor_rollout_wg.compute_log_prob(batch) entropys = old_log_prob.batch["entropys"] From 50cb8dfd799b903af4d4f00dcabd77b7fe4830d9 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Fri, 8 Aug 2025 18:46:05 +0800 Subject: [PATCH 032/182] stop train --- recipe/fully_async_policy/fully_async_trainer.py | 13 ++----------- recipe/fully_async_policy/message_queue.py | 9 ++++++--- tests/special_e2e/run_fully_async_policy.sh | 4 +--- 3 files changed, 9 insertions(+), 17 deletions(-) diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 29a7a5c830b..588f5998fe7 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -285,9 +285,6 @@ def fit(self): # load checkpoint before doing anything self._load_checkpoint() - self.total_training_steps = self.config.trainer.total_training_steps - - print(f"Total training steps: {self.total_training_steps}") # we start from step 1 self.global_steps += 1 last_val_metrics = None @@ -309,7 +306,7 @@ def fit(self): metrics = {} timing_raw = {} - is_last_step = self.global_steps >= self.total_training_steps + is_last_step = False with marked_timer("step", timing_raw): with marked_timer("gen", timing_raw, color="red"): @@ -352,8 +349,6 @@ def fit(self): batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw) print("_log_rollout") self._log_rollout(batch, reward_extra_infos_dict, timing_raw) - print("_validate_metrics") - last_val_metrics = self._validate_metrics(is_last_step, last_val_metrics, metrics, timing_raw) print("_check_save_checkpoint") self._check_save_checkpoint(is_last_step, timing_raw) @@ -366,12 +361,8 @@ def fit(self): print("_trigger_parameter_sync_after_step") self._trigger_parameter_sync_after_step() - print("global_steps") + 
print(f"global_steps: {self.global_steps}") self.global_steps += 1 - print(f"is_last_step {is_last_step}") - if is_last_step: - print("is_last_step") - return def get_statistics(self) -> dict: """获取训练统计信息""" diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py index 47dbd34ecff..ad261b0072a 100644 --- a/recipe/fully_async_policy/message_queue.py +++ b/recipe/fully_async_policy/message_queue.py @@ -119,7 +119,10 @@ def get_samples(self, min_batch_count: int = 1) -> list[Any]: print("get_samples") with self.lock: while len(self.queue) < min_batch_count and self.running: - print("consumer_condition") + print(f"consumer_condition {len(self.queue)}") + for data in self.queue: + if data is None: + return [] self.consumer_condition.wait() # 如果队列已关闭且没有足够样本,返回空列表 @@ -135,7 +138,7 @@ def get_samples(self, min_batch_count: int = 1) -> list[Any]: if data is None: return [] else: - samples.append(self.queue.popleft()) + samples.append(data) self.total_consumed += len(samples) return samples @@ -174,7 +177,7 @@ def clear_queue(self): def shutdown(self): """关闭消息队列""" - with self.lock: # 修正:需要加锁 + with self.lock: self.running = False # 通知所有等待的线程,让它们能够退出 self.consumer_condition.notify_all() diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh index 52a4d2bc8fd..c95476e898a 100644 --- a/tests/special_e2e/run_fully_async_policy.sh +++ b/tests/special_e2e/run_fully_async_policy.sh @@ -114,14 +114,12 @@ common_params=( trainer.val_before_train=False trainer.test_freq=-1 trainer.save_freq=-1 - trainer.total_epochs=2 - trainer.total_training_steps=10 trainer.resume_mode=disable trainer.nnodes=1 trainer.n_gpus_per_node=${n_gpus_training} rollout.nnodes=1 rollout.n_gpus_per_node=${n_gpus_rollout} - rollout.total_rollout_steps=100 + rollout.total_rollout_steps=10 rollout.total_epochs=10 # Fully async specific configurations async_training.staleness_threshold=${staleness_threshold} From 
d59b734298d25fd80ed914363ae8cd322465c2b3 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Fri, 8 Aug 2025 18:50:12 +0800 Subject: [PATCH 033/182] readme docs --- recipe/fully_async_policy/README.md | 66 +++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 recipe/fully_async_policy/README.md diff --git a/recipe/fully_async_policy/README.md b/recipe/fully_async_policy/README.md new file mode 100644 index 00000000000..0509969216b --- /dev/null +++ b/recipe/fully_async_policy/README.md @@ -0,0 +1,66 @@ +# 基于verl的改造方案 + +## 方案 + +### 方案1 (StreamRL, AsyncFlow) + +![StreamRL]( +https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/StreamRL.png?raw=true) + +在分离架构的基础上,修改在Rollout和Train的样本传递过程中,将离线策略生成一批global样本修改为生成一批batch的方式,实现生成和训练两阶段的高度重叠。 +训练阶段一收到足够样本就开始处理,训练一定步数后,将参数同步到PS侧, Rollout在每次样本生成完成后,check是否有新的参数,如果有就进行一次同步。 + +### 方案2 (Mistralai, Areal) + +![mistralai]( +https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/mistralai.png?raw=true) + +在分离架构的基础上,实现Rollout的partial rollout逻辑。样本仍然修改为batch的方式进行传递,实现生成和训练两阶段的高度重叠。 +在参数同步方面,训练阶段主动触发Rollout的暂停,参数同步以及恢复。 Rollout使用Rollout Server的方式,支持样本生成的中断与恢复, +产生的的样本所使用的参数版本会有所不同。 + +### 折中 + +上述两种方案的核心都是将训练与生成进行overlap,核心区别主要集中在参数同步的处理方式不同,方案1需要实现PS完成参数的异步加载。 +方案2使用同步的方式进行参数同步,但需要完成PartialRollout的逻辑。综合已有代码,以及社区进行中的工作,我们希望先将异步的工作流搭建完成,先以方案1进行开发,后续再进一步开发方案2。 + +## 设计 + +### 架构图 + +![full_async]( +https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/full_async.svg?raw=true) + +为实现纯异步训练工作流,基于已有的 one step off policy 代码,扩增实现 Rollouter 以及 Message Queue,以及对Trainer进行更新。 + +整体的训练流程参考StreamRL,将原有流程中生成 train_batch_size 个样本后进行下一步训练的过程,修改为流式的样本传递,train +拿到一次前向的样本后就进行样本分发(ppo_mini_batch_size*worker)。与one-step-off相比,我们将一次step的异步,继续细化到一次batch的异步。 + +**MessageQueue** 作为Ray的Actor存在,支持zeromq消息队列保存生成的样本,并提供给Trainer使用。Trainer 和 Rollouter 都持有 +MessageQueue 的Handler,通过接口完成样本的插入与消费。 + +**FullyAsyncRollouter** 类似于现有的 Trainer,实现fit()工作流,循环调用 Rollout 进行样本的生成。FullyAsyncRollouter 
对于已有的 +vLLMAsyncRollout SGLangAsyncRollout 进行封装。 + +* 方案1,使用异步更新策略,FullyAsyncRollouter 根据样本生成的进展,自动访问PS,判断是否进行新的参数加载。 +* 方案2,参考PR https://github.com/volcengine/verl/pull/2246 https://github.com/volcengine/verl/pull/2200 Rollout + 组件需要支持暂停及恢复,从而进行参数的更新。暂停时,需要保存进行中的rollout样本,下次继续恢复生产。 + +**FullyAsyncTrainer** 与当前实现类似,区别是样本的获取修改为从Queue中获取,Queue有最少batch样本就开始进行分发。rainer完成一次step的训练后, +与FullyAsyncRollouter的使用策略对应: + +* 方案1,使用异步更新策略,参数产生后,主动同步到PS中。 +* 方案2,直接调用Rollouter进行同步,主动通知Rollouter暂停生成,进行参数的同步更新。 + +## 总结 + +当Rollouter生产快于Trainer消费时,queue中会存在多步过期的样本,我们需要在Rollouter中设置“陈旧度 staleness”阈值, +由当前的参数版本以及生成的样本数量,决定是否要暂停生成。zeromq 的最大长度应为 staleness * total_size,并且实现基于陈旧度的拒绝策略,进行防御性编程。 + +* 当使用方案1时,参数的同步由FullyAsyncRollouter主动控制,触发时机取决预先设置的固定数量样本完成以及参数已就绪,产生的样本所使用的参数版本一致, + 但是避免不了长尾的问题,会有"rollout空洞"产生。 + +* 当使用方案2时,参数的同步会更加及时,陈旧度低的样本数量较多,但是长尾样本由不同的参数产生,长尾样本的不同token所对应的参数版本会传递给训练引擎, + 后续可以根据这一信息对loss进行加权处理。 + +当Rollouter生产慢于Trainer消费时,队列长时间为空,基本等价于同步训练。 \ No newline at end of file From 6e5da717c8cb67c2e6e49e70c7bdcedf6e702457 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Fri, 8 Aug 2025 19:40:28 +0800 Subject: [PATCH 034/182] refactor code --- recipe/fully_async_policy/fully_async_main.py | 55 +- .../fully_async_rollouter.py | 71 +-- .../fully_async_policy/fully_async_trainer.py | 107 ++-- recipe/fully_async_policy/param_sync.py | 20 +- .../unittest/protocol_examples.py | 202 -------- recipe/fully_async_policy/unittest/test_mq.py | 473 ++++++------------ .../fully_async_policy/unittest/test_mq2.py | 171 ------- .../unittest/test_protocol_split_merge.py | 207 +++++++- 8 files changed, 426 insertions(+), 880 deletions(-) delete mode 100644 recipe/fully_async_policy/unittest/protocol_examples.py delete mode 100644 recipe/fully_async_policy/unittest/test_mq2.py diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py index 31541982dd7..6afb44abd9d 100644 --- a/recipe/fully_async_policy/fully_async_main.py +++ 
b/recipe/fully_async_policy/fully_async_main.py @@ -13,10 +13,8 @@ # limitations under the License. import os -import signal import socket import threading -import time from pprint import pprint import hydra @@ -33,14 +31,14 @@ def create_resource_pool_manager(config, roles: list) -> ResourcePoolManager: """ - 创建资源池管理器 + Create resource pool manager Args: - config: 配置对象 - roles: 需要创建资源池的角色列表 + config: Configuration object + roles: List of roles that need to create resource pools Returns: - ResourcePoolManager: 资源池管理器 + ResourcePoolManager: Resource pool manager """ # 构建资源池规格 resource_pool_spec = {} @@ -73,13 +71,13 @@ def create_resource_pool_manager(config, roles: list) -> ResourcePoolManager: def create_role_worker_mapping(config): """ - 创建角色到worker类的映射 + Create mapping from roles to worker classes Args: - config: 配置对象 + config: Configuration object Returns: - dict: 角色到worker类的映射 + dict: Mapping from roles to worker classes """ # 根据策略选择worker类 if config.actor_rollout_ref.actor.strategy == "fsdp2": @@ -121,7 +119,6 @@ def create_role_worker_mapping(config): Role.Critic: ray.remote(CriticWorker), } - # 添加reward model(如果启用) if config.reward_model.enable: if config.reward_model.strategy == "fsdp2": from verl.workers.fsdp_workers import RewardModelWorker @@ -153,36 +150,23 @@ def __init__(self): def run(self, config): """运行完全异步的PPO训练""" print("Starting fully async PPO training...") - # 初始化基础组件 self._initialize_components(config) - # 启动训练流程 self._run_training_loop() def _initialize_components(self, config) -> None: - """ - 初始化所有组件 - - Args: - config: 配置对象 - - Returns: - bool: 是否初始化成功 - """ - # 打印配置信息 print(f"TaskRunner hostname: {socket.gethostname()}, PID: {os.getpid()}") pprint(OmegaConf.to_container(config, resolve=True)) OmegaConf.resolve(config) - # 初始化模型路径和tokenizer print("Initializing model and tokenizer...") local_path = copy_to_local( config.actor_rollout_ref.model.path, use_shm=config.actor_rollout_ref.model.get("use_shm", False) ) - # Instantiate the 
tokenizer and processor. from verl.utils import hf_processor, hf_tokenizer trust_remote_code = config.data.get("trust_remote_code", False) tokenizer = hf_tokenizer(local_path, trust_remote_code=trust_remote_code) + # Used for multimodal LLM, could be None processor = hf_processor(local_path, trust_remote_code=trust_remote_code, use_fast=True) @@ -190,13 +174,11 @@ def _initialize_components(self, config) -> None: self.components["processor"] = processor self.components["config"] = config # 保存config以供其他方法使用 - # 创建worker映射和资源池 print("Creating worker mapping and resource pools...") role_worker_mapping, ray_worker_group_cls = create_role_worker_mapping(config) self.components["role_worker_mapping"] = role_worker_mapping self.components["ray_worker_group_cls"] = ray_worker_group_cls - # 创建奖励函数 print("Loading reward functions...") reward_fn = load_reward_manager( config, tokenizer, num_examine=0, **config.reward_model.get("reward_kwargs", {}) @@ -207,12 +189,11 @@ def _initialize_components(self, config) -> None: self.components["reward_fn"] = reward_fn self.components["val_reward_fn"] = val_reward_fn - # 创建MessageQueue self.max_queue_size = ( - config.async_training.staleness_threshold - * config.data.train_batch_size - * config.actor_rollout_ref.rollout.n - ) * 10 # x 10 避免死锁 + config.async_training.staleness_threshold + * config.data.train_batch_size + * config.actor_rollout_ref.rollout.n + ) * 10 # x 10 avoid deadlock print("Creating MessageQueue...") message_queue = MessageQueue.remote(config, self.max_queue_size) message_queue_client = MessageQueueClient(message_queue) @@ -220,15 +201,12 @@ def _initialize_components(self, config) -> None: self.components["message_queue"] = message_queue self.components["message_queue_client"] = message_queue_client - # 创建Rollouter print("Creating FullyAsyncRollouter...") self._create_rollouter(config) - # 创建Trainer print("Creating FullyAsyncTrainer...") self._create_trainer(config) - # 设置参数同步 print("Setting up parameter 
synchronization...") from recipe.fully_async_policy.param_sync import ParameterSynchronizer @@ -239,18 +217,15 @@ def _initialize_components(self, config) -> None: mq=self.components["message_queue_client"], ) - # 将参数同步器设置到trainer和rollouter ray.get(self.components["trainer"].set_parameter_synchronizer.remote(param_synchronizer)) ray.get(self.components["rollouter"].set_parameter_synchronizer.remote(param_synchronizer)) - # 首先同步一次参数 ray.get(param_synchronizer.sync_weights.remote(0)) self.components["param_synchronizer"] = param_synchronizer print("All components initialized successfully") def _create_rollouter(self, config) -> None: - """创建Rollouter""" pprint(self.components) rollouter = FullyAsyncRollouter.remote( config=config, @@ -269,7 +244,6 @@ def _create_rollouter(self, config) -> None: print("Rollouter created and initialized successfully") def _create_trainer(self, config) -> None: - """创建Trainer""" trainer_role_mapping = { role: worker_cls for role, worker_cls in self.components["role_worker_mapping"].items() @@ -288,14 +262,12 @@ def _create_trainer(self, config) -> None: device_name=config.trainer.device, ) - # 初始化Trainer ray.get(trainer.init_workers.remote()) ray.get(trainer.set_message_queue_client.remote(self.components["message_queue_client"])) self.components["trainer"] = trainer print("FullyAsyncTrainer created and initialized successfully") def _run_training_loop(self): - """运行训练循环""" self.running = True print("Starting Rollouter in background...") @@ -312,10 +284,9 @@ def _run_training_loop(self): @hydra.main(config_path="config", config_name="fully_async_ppo_trainer", version_base=None) def main(config): - """主入口函数""" from verl.trainer.main_ppo import run_ppo - # 确保异步训练配置存在 + # Ensure async training config exists if not hasattr(config, "async_training"): raise RuntimeError("must set async_training config") run_ppo(config, task_runner_class=FullyAsyncTaskRunner) diff --git a/recipe/fully_async_policy/fully_async_rollouter.py 
b/recipe/fully_async_policy/fully_async_rollouter.py index d392e4a1630..01affa67586 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -29,38 +29,24 @@ @ray.remote(num_cpus=10, max_concurrency=10) class FullyAsyncRollouter(RayPPOTrainer): """ - 异步样本生成器,负责持续生成训练样本并放入MessageQueue - 基于OneStepOffRayTrainer的成熟实现改进 + Asynchronous sample generator, responsible for continuously generating training samples + and putting them into MessageQueue + Based on the mature implementation improvements of OneStepOffRayTrainer """ def __init__( - self, - config, - tokenizer, - role_worker_mapping: dict[Role, WorkerType], - resource_pool_manager: ResourcePoolManager, - ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, - processor=None, - reward_fn=None, - val_reward_fn=None, - device_name=None, - max_queue_size=1000, + self, + config, + tokenizer, + role_worker_mapping: dict[Role, WorkerType], + resource_pool_manager: ResourcePoolManager, + ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, + processor=None, + reward_fn=None, + val_reward_fn=None, + device_name=None, + max_queue_size=1000, ): - """ - Initialize distributed PPO trainer with Ray backend. - Note that this trainer runs on the driver process on a single CPU/GPU node. - - Args: - config: Configuration object containing training parameters. - tokenizer: Tokenizer used for encoding and decoding text. - role_worker_mapping (dict[Role, WorkerType]): Mapping from roles to worker classes. - resource_pool_manager (ResourcePoolManager): Manager for Ray resource pools. - ray_worker_group_cls (RayWorkerGroup, optional): Class for Ray worker groups. Defaults to RayWorkerGroup. - processor: Optional data processor, used for multimodal data - reward_fn: Function for computing rewards during training. - val_reward_fn: Function for computing rewards during validation. - device_name (str, optional): Device name for training (e.g., "cuda", "cpu"). 
Defaults to None. - """ # Store the tokenizer for text processing self.tokenizer = tokenizer self.processor = processor @@ -86,7 +72,7 @@ def __init__( self.use_reference_policy = False self.use_rm = False - # 创建数据集 + # Create datasets print("Creating datasets...") from verl.trainer.main_ppo import create_rl_dataset, create_rl_sampler from verl.utils.dataset.rl_dataset import collate_fn @@ -107,16 +93,16 @@ def __init__( self.total_rollout_steps = total_rollout_steps print(f"Total rollout steps: {self.total_rollout_steps}") - # rollouter 参数配置 + # Rollouter parameter configuration self.message_queue_client = None self.current_param_version = 0 - # 新鲜度控制 - 改进的配置管理 + # Freshness control - improved configuration management async_config = config.async_training self.staleness_threshold = async_config.get("staleness_threshold", 3) - # 统计信息 + # Statistics self.total_generated_samples = 0 self.dropped_stale_samples = 0 self.param_sync_requests = 0 @@ -125,7 +111,7 @@ def __init__( self.rollout_wg = None self.message_queue_client = None - # 并发控制 + # Concurrency control self.running = False self.paused = False self.generation_thread = None @@ -134,48 +120,43 @@ def __init__( self.lock = threading.RLock() self.condition = threading.Condition(self.lock) - # 暂停/恢复统计信息 + # Pause/resume statistics self.pause_count = 0 self.resume_count = 0 self.total_pause_time = 0.0 self.last_pause_time = None - # 参数同步相关 + # Parameter synchronization related self.param_synchronizer = None self.last_sync_time = 0 self.sync_in_progress = False self.sync_lock = threading.Lock() - # 参数同步状态 - 基于one_step_off_policy模式 - self._weights_info = None - self._is_rollout = True # rollouter是rollout角色 - self._is_actor = False - self.max_queue_size = max_queue_size def set_message_queue_client(self, message_queue_client: MessageQueueClient): - """设置消息队列客户端""" + """Set message queue client""" with self.lock: self.message_queue_client = message_queue_client def set_parameter_synchronizer(self, param_synchronizer): 
- """设置参数同步器""" + """Set parameter synchronizer""" with self.lock: self.param_synchronizer = param_synchronizer def get_rollout_wg(self): - """获取 rollout worker group""" + """Get rollout worker group""" return self.rollout_wg def update_param_version(self, version: int): - """更新当前参数版本""" + """Update current parameter version""" with self.lock: old_version = self.current_param_version self.current_param_version = version print(f"Parameter version updated from {old_version} to {version}") def _validate_config(self): - # 验证异步训练配置 + # Validate asynchronous training configuration if not hasattr(self.config, "async_training"): raise ValueError("Missing async_training configuration") diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 588f5998fe7..39fedb022d5 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -13,10 +13,8 @@ # limitations under the License. import logging -import threading import time import warnings -from pprint import pprint from typing import Any import numpy as np @@ -46,18 +44,17 @@ class FullyAsyncTrainer(RayPPOTrainer): """ def __init__( - self, - config, - tokenizer, - role_worker_mapping: dict[Role, WorkerType], - resource_pool_manager: ResourcePoolManager, - ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, - processor=None, - reward_fn=None, - val_reward_fn=None, - device_name=None, + self, + config, + tokenizer, + role_worker_mapping: dict[Role, WorkerType], + resource_pool_manager: ResourcePoolManager, + ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, + processor=None, + reward_fn=None, + val_reward_fn=None, + device_name=None, ): - # Store the tokenizer for text processing self.tokenizer = tokenizer self.processor = processor @@ -97,43 +94,35 @@ def __init__( self._validate_config() - self.lock = threading.RLock() self.message_queue_client = None self.param_synchronizer = None - # 统计信息 + # 
Statistics self.processed_samples = 0 self.stale_samples_processed = 0 self.current_param_version = 0 - # 参数同步相关状态 - self._weights_info = None - self._is_actor = False # 将在init_worker_group中设置 - self._is_rollout = False - def set_message_queue_client(self, message_queue_client: MessageQueueClient): - """设置消息队列客户端""" - with self.lock: - self.message_queue_client = message_queue_client + """Set message queue client""" + self.message_queue_client = message_queue_client def set_parameter_synchronizer(self, param_synchronizer): - """设置参数同步器""" - with self.lock: - self.param_synchronizer = param_synchronizer + """Set parameter synchronizer""" + self.param_synchronizer = param_synchronizer def get_actor_wg(self): - """获取 actor worker group""" + """Get actor worker group""" return self.actor_wg def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]: """ - 从消息队列获取样本并组成gen_batch_output + Get samples from message queue and compose gen_batch_output Returns: tuple: (epoch, batch_dict, gen_batch_output) """ - # 计算需要获取的样本数量 + # Calculate the number of samples needed n_responses_per_prompt = self.config.actor_rollout_ref.rollout.n batch_size = self.config.data.train_batch_size required_samples = n_responses_per_prompt * batch_size @@ -143,7 +132,7 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]: flush=True, ) - # 从队列获取样本 + # Get samples from queue consumer_start = time.time() queue_samples = self.message_queue_client.get_samples(min_batch_count=required_samples) consumer_end = time.time() @@ -157,7 +146,7 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]: queue_samples = [ray.cloudpickle.loads(x) for x in queue_samples] print(queue_samples) - # 组装 batch + # Assemble batch batch = self._assemble_gen_batch_output_from_queue_samples(queue_samples) print("=" * 200) @@ -167,15 +156,15 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]: def _assemble_gen_batch_output_from_queue_samples(self, 
queue_samples: list[QueueSample]): """ - 从队列样本中组装gen_batch_output + Assemble gen_batch_output from queue samples Args: - queue_samples: 队列中的样本列表 - n_responses_per_prompt: 每个prompt的响应数量 - batch_size: 批次大小 + queue_samples: List of samples from queue + n_responses_per_prompt: Number of responses per prompt + batch_size: Batch size Returns: - DataProto: 组装好的gen_batch_output + DataProto: Assembled gen_batch_output """ import numpy as np @@ -186,7 +175,7 @@ def _assemble_gen_batch_output_from_queue_samples(self, queue_samples: list[Queu print(f"Assembling batch from {len(queue_samples)} queue samples") - # 提取所有样本的数据和元数据 + # Extract data and metadata from all samples sample_data_list = [] rollout_metadata_list = [] timing_info = {} @@ -197,11 +186,11 @@ def _assemble_gen_batch_output_from_queue_samples(self, queue_samples: list[Queu batch = DataProto.from_items(sample_data_list) - # 收集timing信息和metadata + # Collect timing information and metadata param_versions = [] sample_timestamps = [] for metadata in rollout_metadata_list: - # 提取参数版本和时间戳 + # Extract parameter version and timestamp param_versions.append(metadata.get("rollout_param_version", 0)) sample_timestamps.append(metadata.get("generation_timestamp", time.time())) if "timing" in metadata: @@ -210,13 +199,13 @@ def _assemble_gen_batch_output_from_queue_samples(self, queue_samples: list[Queu timing_info[timing_key] = [] # if isinstance(timing_value, (int, float)): # timing_info[timing_key].append(timing_value) - # 计算平均timing + # Calculate average timing avg_timing = {} for key, values in timing_info.items(): if values and len(values) > 0: avg_timing[key] = sum(values) / len(values) - # 创建meta_info + # Create meta_info meta_info = { "timing": avg_timing, "queue_sample_count": len(queue_samples), @@ -287,15 +276,14 @@ def fit(self): # we start from step 1 self.global_steps += 1 - last_val_metrics = None self.max_steps_duration = 0 - # 使用队列模式,不需要传统的dataloader迭代器 - # 初始化获取第一批数据 + # Use queue mode, no need for traditional 
dataloader iterator + # Initialize to get the first batch of data while True: print("while True", flush=True) - # 检查队列状态 + # Check queue status if self.message_queue_client: queue_stats = self.message_queue_client.get_statistics() print(f"Queue status before getting samples: {queue_stats}") @@ -317,7 +305,6 @@ def fit(self): print("_get_samples_from_queue end") # # 更新统计信息 - # with self.lock: # self.processed_samples += len(batch) if isinstance(batch, list) else 1 # # # 从meta_info中获取参数版本信息 @@ -352,20 +339,17 @@ def fit(self): print("_check_save_checkpoint") self._check_save_checkpoint(is_last_step, timing_raw) - print("_stop_profiling") - # self._stop_profiling(do_profile, timing_raw) print("_collect_metrics") # self._collect_metrics(batch, epoch, metrics, timing_raw) - # 在训练步骤结束后触发参数同步 + # Trigger parameter synchronization after training step print("_trigger_parameter_sync_after_step") - self._trigger_parameter_sync_after_step() print(f"global_steps: {self.global_steps}") self.global_steps += 1 def get_statistics(self) -> dict: - """获取训练统计信息""" + """Get training statistics""" queue_stats = self.message_queue_client.get_statistics() if self.message_queue_client else {} return { "global_steps": self.global_steps, @@ -380,37 +364,40 @@ def get_statistics(self) -> dict: def _trigger_parameter_sync_after_step(self): """ - 在训练步骤结束后触发参数同步 - 这确保rollouter总是使用最新训练的参数 + Trigger parameter synchronization after training step + This ensures rollouter always uses the latest trained parameters """ self.current_param_version = self.current_param_version + 1 print( - f"[TRAINER] Triggering parameter sync after training step {self.global_steps}, version: {self.current_param_version}" + f"[TRAINER] Triggering parameter sync after " + f"training step {self.global_steps}, version: {self.current_param_version}" + ) + logger.info( + f"Triggering parameter sync after training step {self.global_steps}, version: {self.current_param_version}" ) - logger.info(f"Triggering parameter sync after 
training step {self.global_steps}, version: {self.current_param_version}") ray.get(self.param_synchronizer.sync_weights.remote(self.current_param_version)) def _compute_sample_freshness_metrics(self, batch_samples: list[QueueSample]) -> dict: """ - 计算样本新鲜度指标 + Compute sample freshness metrics Args: - batch_samples: 队列样本列表 + batch_samples: List of queue samples Returns: - dict: 新鲜度指标字典 + dict: Dictionary of freshness metrics """ if not batch_samples: return {} try: - # 提取参数版本和时间戳 + # Extract parameter versions and timestamps sample_ages = [] sample_latencies = [] current_time = time.time() for sample in batch_samples: - # 从rollout_metadata中获取信息 + # Get information from rollout_metadata if hasattr(sample, "rollout_metadata") and sample.rollout_metadata: rollout_version = sample.rollout_metadata.get("rollout_param_version", 0) generation_time = sample.rollout_metadata.get("generation_timestamp", current_time) diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py index cb9baa5ff8a..11d94c79ae4 100644 --- a/recipe/fully_async_policy/param_sync.py +++ b/recipe/fully_async_policy/param_sync.py @@ -23,9 +23,9 @@ @ray.remote class ParameterSynchronizer: """ - 统一的参数同步器,负责在actor和rollout之间同步模型参数 - 基于one_step_off_policy的成熟同步模式实现 - 合并了原有的多个同步器类的功能 + Unified parameter synchronizer, responsible for synchronizing model parameters between actor and rollout + Based on the mature synchronization mode implementation of one_step_off_policy + Merges the functions of the original multiple synchronizer classes """ def __init__(self, config, trainer, rollouter, mq): @@ -36,23 +36,23 @@ def __init__(self, config, trainer, rollouter, mq): self.actor_wg = ray.get(trainer.get_actor_wg.remote()) self.rollout_wg = ray.get(rollouter.get_rollout_wg.remote()) - # 基础属性 + # Basic attributes self.weights_info = None self.sync_group_initialized = False self.sync_group_name = "actor_rollout" - # 统计信息 + # Statistics self.current_version = 0 
self._init_weights_info() self._init_sync_group() def get_current_param_version(self) -> int: - """获取当前参数版本号""" + """Get current parameter version number""" return self.current_version def get_weights_info(self): - """获取权重信息""" + """Get weights info""" return self.weights_info def _init_weights_info(self): @@ -74,16 +74,16 @@ def sync_weights(self, version): self.current_version = version print(f"Starting weight synchronization (version {self.current_version})...") - print("pause rollout") ray.get(self.rollouter.pause.remote()) - # 更新MQ 版本 + # Update MQ version self.mq_client.update_param_version(version) + # sync weights self.actor_wg.sync_rollout_weights() ray.get(self.rollout_wg.sync_rollout_weights()) - # 更新 rollout 版本 + # Update rollout version ray.get(self.rollouter.update_param_version.remote(version)) ray.get(self.rollouter.resume.remote()) print("sync_weights success") diff --git a/recipe/fully_async_policy/unittest/protocol_examples.py b/recipe/fully_async_policy/unittest/protocol_examples.py deleted file mode 100644 index b695c163c23..00000000000 --- a/recipe/fully_async_policy/unittest/protocol_examples.py +++ /dev/null @@ -1,202 +0,0 @@ -# Copyright 2025 Meituan Ltd. and/or its affiliates -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import numpy as np -import torch - -from verl.protocol import DataProto, DataProtoItem - - -def example_basic_split_merge(): - """Basic example of splitting DataProto into DataProtoItems and merging back.""" - print("=== Basic Split and Merge Example ===") - - # Create sample data - batch_size = 3 - seq_len = 5 - - # Create tensors - input_ids = torch.randint(0, 1000, (batch_size, seq_len)) - attention_mask = torch.ones(batch_size, seq_len) - - # Create non-tensor data - prompts = np.array(["Hello world", "How are you?", "Good morning"], dtype=object) - scores = np.array([0.8, 0.9, 0.7], dtype=object) - - # Create DataProto - data_proto = DataProto.from_dict( - tensors={"input_ids": input_ids, "attention_mask": attention_mask}, - non_tensors={"prompts": prompts, "scores": scores}, - meta_info={"model_name": "test_model", "version": "1.0"}, - ) - - print(f"Original DataProto length: {len(data_proto)}") - print(f"Input IDs shape: {data_proto.batch['input_ids'].shape}") - print(f"Prompts: {data_proto.non_tensor_batch['prompts']}") - - # Split into DataProtoItems - items = data_proto.to_items() - print(f"\nSplit into {len(items)} items") - - for i, item in enumerate(items): - print(f"Item {i}:") - print(f" Input IDs shape: {item.batch['input_ids'].shape}") - print(f" Prompt: {item.non_tensor_batch['prompts']}") - print(f" Score: {item.non_tensor_batch['scores']}") - - # Merge back to DataProto - merged_proto = DataProto.from_items(items) - print(f"\nMerged DataProto length: {len(merged_proto)}") - print(f"Merged Input IDs shape: {merged_proto.batch['input_ids'].shape}") - print(f"Merged prompts: {merged_proto.non_tensor_batch['prompts']}") - - # Verify they're identical - assert torch.equal(data_proto.batch["input_ids"], merged_proto.batch["input_ids"]) - assert torch.equal(data_proto.batch["attention_mask"], merged_proto.batch["attention_mask"]) - assert np.array_equal(data_proto.non_tensor_batch["prompts"], merged_proto.non_tensor_batch["prompts"]) - assert 
np.array_equal(data_proto.non_tensor_batch["scores"], merged_proto.non_tensor_batch["scores"]) - - print("\n✓ Original and merged DataProto are identical!") - - -def example_item_processing(): - """Example showing individual item processing before merging.""" - print("\n=== Individual Item Processing Example ===") - - # Create initial data - # batch_size = 4 - - values = torch.tensor([1.0, 2.0, 3.0, 4.0]).unsqueeze(1) # Shape: (4, 1) - labels = np.array(["A", "B", "C", "D"], dtype=object) - - original_proto = DataProto.from_dict( - tensors={"values": values}, non_tensors={"labels": labels}, meta_info={"processing_step": 0} - ) - - print(f"Original values: {original_proto.batch['values'].flatten()}") - print(f"Original labels: {original_proto.non_tensor_batch['labels']}") - - # Split and process each item individually - items = original_proto.to_items() - processed_items = [] - - for i, item in enumerate(items): - # Process the tensor data (multiply by 2) - processed_value = item.batch["values"] * 2 - - # Process the non-tensor data (add suffix) - processed_label = item.non_tensor_batch["labels"] + f"_processed_{i}" - - # Create new processed item - processed_item = DataProtoItem( - batch=item.batch.clone(), # Clone the TensorDict - non_tensor_batch=item.non_tensor_batch.copy(), - meta_info=item.meta_info.copy(), - ) - - # Update with processed data - processed_item.batch["values"] = processed_value - processed_item.non_tensor_batch["labels"] = processed_label - processed_item.meta_info["processing_step"] = 1 - - processed_items.append(processed_item) - - print(f"Processed item {i}: value={processed_value.item()}, label='{processed_label}'") - - # Merge processed items back - processed_proto = DataProto.from_items(processed_items) - - print(f"\nProcessed values: {processed_proto.batch['values'].flatten()}") - print(f"Processed labels: {processed_proto.non_tensor_batch['labels']}") - print(f"Processing step: {processed_proto.meta_info['processing_step']}") - - -def 
example_convenience_methods(): - """Example showing convenience methods.""" - print("\n=== Convenience Methods Example ===") - - # Create a single DataProtoItem - single_tensor = torch.tensor([42]).unsqueeze(0) # Shape: (1,) - single_item = DataProtoItem( - batch=None, # We'll create TensorDict manually - non_tensor_batch={"text": "Hello"}, - meta_info={"source": "manual"}, - ) - - # Create TensorDict manually for the single item - from tensordict import TensorDict - - single_item.batch = TensorDict({"data": single_tensor}, batch_size=(1,)) - - print(f"Single item data: {single_item.batch['data']}") - print(f"Single item text: {single_item.non_tensor_batch['text']}") - - # Convert single item to DataProto using convenience method - single_proto = single_item.to_proto() - print(f"Converted to DataProto length: {len(single_proto)}") - - # Create multiple items and use static convenience method - items = [single_item] - for i in range(2): - new_item = single_item.copy() # Use the copy method - new_item.batch["data"] = torch.tensor([100 + i]).unsqueeze(0) - new_item.non_tensor_batch["text"] = f"Item {i + 1}" - items.append(new_item) - - # Use DataProtoItem.from_items() convenience method - merged_proto = DataProtoItem.from_items(items) - print(f"Merged using convenience method - length: {len(merged_proto)}") - print(f"Data: {merged_proto.batch['data'].flatten()}") - print(f"Texts: {merged_proto.non_tensor_batch['text']}") - - -def example_error_handling(): - """Example showing error handling.""" - print("\n=== Error Handling Example ===") - - # Try to create DataProto from empty list - try: - DataProto.from_items([]) - print("ERROR: Should have raised exception for empty list") - except ValueError as e: - print(f"✓ Correctly caught error for empty list: {e}") - - # Try to merge items with inconsistent structure - try: - item1 = DataProtoItem( - batch=TensorDict({"data": torch.tensor([1]).unsqueeze(0)}, batch_size=(1,)), - non_tensor_batch={"text": "Hello"}, - ) - item2 
= DataProtoItem( - batch=TensorDict({"different_key": torch.tensor([2]).unsqueeze(0)}, batch_size=(1,)), - non_tensor_batch={"text": "World"}, - ) - - DataProto.from_items([item1, item2]) - print("ERROR: Should have raised exception for inconsistent structure") - except ValueError as e: - print(f"✓ Correctly caught error for inconsistent structure: {e}") - - -if __name__ == "__main__": - # Import tensordict for the examples - from tensordict import TensorDict - - # Run all examples - example_basic_split_merge() - example_item_processing() - example_convenience_methods() - example_error_handling() - - print("\n🎉 All examples completed successfully!") diff --git a/recipe/fully_async_policy/unittest/test_mq.py b/recipe/fully_async_policy/unittest/test_mq.py index b766c60f858..7af4945f311 100644 --- a/recipe/fully_async_policy/unittest/test_mq.py +++ b/recipe/fully_async_policy/unittest/test_mq.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ import threading import time from unittest.mock import Mock @@ -19,30 +20,30 @@ import ray from omegaconf import DictConfig -from recipe.fully_async_policy.message_queue import MessageQueue, MessageQueueClient, QueueSample +from recipe.fully_async_policy.message_queue import MessageQueue, MessageQueueClient @pytest.fixture -def mock_data_proto(): - """Mock数据对象""" +def mock_sample(): + """Mock sample data object""" return Mock() @pytest.fixture def basic_config(): - """基础配置""" + """Basic configuration""" return DictConfig({"async_training": {"staleness_threshold": 3}}) @pytest.fixture def queue_config(): - """队列配置""" + """Queue configuration with different staleness threshold""" return DictConfig({"async_training": {"staleness_threshold": 2}}) @pytest.fixture def ray_setup(): - """设置Ray环境""" + """Setup Ray environment""" if not ray.is_initialized(): ray.init(local_mode=True, ignore_reinit_error=True) yield @@ -51,7 +52,7 @@ def ray_setup(): @pytest.fixture def message_queue_client(ray_setup, basic_config): - """创建MessageQueue actor并返回其客户端""" + """Create MessageQueue actor and return its client""" actor = MessageQueue.remote(basic_config, max_queue_size=10) client = MessageQueueClient(actor) yield client @@ -59,125 +60,110 @@ def message_queue_client(ray_setup, basic_config): class TestMessageQueue: - """测试MessageQueue(通过MessageQueueClient)""" - - def test_put_samples_success(self, message_queue_client, mock_data_proto): - """测试成功放入samples""" - samples = [mock_data_proto, mock_data_proto] - metadata_list = [{"test": "data1"}, {"test": "data2"}] - - result = message_queue_client.put_sample(sample=samples, param_version=1, rollout_metadata=metadata_list) + """Test MessageQueue (through MessageQueueClient)""" + def test_put_sample_success(self, message_queue_client, mock_sample): + """Test successfully putting a sample""" + result = message_queue_client.put_sample(sample=mock_sample, param_version=1) assert result is True - # 检查队列大小 + # Check queue size queue_size = 
message_queue_client.get_queue_size() - assert queue_size == 2 + assert queue_size == 1 - # 检查统计信息 + # Check statistics stats = message_queue_client.get_statistics() - assert stats["total_produced"] == 2 - assert stats["queue_size"] == 2 + assert stats["total_produced"] == 1 + assert stats["queue_size"] == 1 - def test_put_samples_without_metadata(self, message_queue_client, mock_data_proto): - """测试不提供metadata时的处理""" - samples = [mock_data_proto, mock_data_proto] + def test_put_multiple_samples(self, message_queue_client, mock_sample): + """Test putting multiple samples""" + for i in range(3): + result = message_queue_client.put_sample(sample=mock_sample, param_version=1) + assert result is True - result = message_queue_client.put_sample(sample=samples, param_version=1, rollout_metadata=None) - - assert result is True + # Check queue size queue_size = message_queue_client.get_queue_size() - assert queue_size == 2 - - def test_put_samples_metadata_mismatch(self, message_queue_client, mock_data_proto): - """测试metadata长度不匹配的处理""" - samples = [mock_data_proto, mock_data_proto] - metadata_list = [{"test": "data1"}] # 长度不匹配 + assert queue_size == 3 - result = message_queue_client.put_sample(sample=samples, param_version=1, rollout_metadata=metadata_list) - - assert result is False # 应该失败 - queue_size = message_queue_client.get_queue_size() - assert queue_size == 0 + # Check statistics + stats = message_queue_client.get_statistics() + assert stats["total_produced"] == 3 + assert stats["queue_size"] == 3 - def test_put_samples_staleness_check(self, message_queue_client, mock_data_proto): - """测试新鲜度检查""" - # 更新参数版本为5 + def test_put_sample_staleness_check(self, message_queue_client, mock_sample): + """Test freshness check when putting samples""" + # Update parameter version to 5 message_queue_client.update_param_version(5) - # 尝试放入版本过旧的batch(版本差异>=3会被拒绝) - samples = [mock_data_proto] + # Try to put a stale sample (version difference >= 3 will be rejected) result = 
message_queue_client.put_sample( - sample=samples, - param_version=2, # 5-2=3, 达到阈值 - rollout_metadata=None, + sample=mock_sample, + param_version=2, # 5-2=3, reaches threshold ) assert result is False - # 检查统计信息中的丢弃样本数 + # Check dropped samples count in statistics stats = message_queue_client.get_statistics() assert stats["dropped_samples"] == 1 - def test_put_samples_queue_overflow(self, message_queue_client, mock_data_proto): - """测试队列溢出处理""" - # 填满队列(最大容量10) - for i in range(6): # 每次放入2个,总共12个,超过最大容量10 - samples = [mock_data_proto, mock_data_proto] - message_queue_client.put_sample(sample=samples, param_version=1, rollout_metadata=None) + def test_put_sample_queue_overflow(self, message_queue_client, mock_sample): + """Test queue overflow handling""" + # Fill the queue (max capacity 10) + for i in range(12): # Put 12 samples, exceeding max capacity 10 + message_queue_client.put_sample(sample=mock_sample, param_version=1) - # 队列大小应该保持在最大值 + # Queue size should stay at maximum value queue_size = message_queue_client.get_queue_size() assert queue_size == 10 - # 检查统计信息 + # Check statistics stats = message_queue_client.get_statistics() - assert stats["dropped_samples"] == 2 # 超出的2个被丢弃 + assert stats["dropped_samples"] == 2 # 2 samples should be dropped - def test_get_samples_success(self, message_queue_client, mock_data_proto): - """测试成功获取samples""" - # 先放入一些samples - samples = [mock_data_proto, mock_data_proto, mock_data_proto] - metadata_list = [{"index": 0}, {"index": 1}, {"index": 2}] - message_queue_client.put_sample(sample=samples, param_version=1, rollout_metadata=metadata_list) + def test_get_samples_success(self, message_queue_client, mock_sample): + """Test successfully getting samples""" + # First put some samples + for i in range(3): + message_queue_client.put_sample(sample=mock_sample, param_version=1) - # 获取2个samples + # Get 2 samples retrieved_samples = message_queue_client.get_samples(min_batch_count=2) assert retrieved_samples is not None assert 
len(retrieved_samples) == 2 - assert all(isinstance(sample, QueueSample) for sample in retrieved_samples) - # 检查队列大小减少 + # Check queue size decreased queue_size = message_queue_client.get_queue_size() assert queue_size == 1 - # 检查统计信息 + # Check statistics stats = message_queue_client.get_statistics() assert stats["total_consumed"] == 2 - def test_get_samples_blocking_behavior(self, message_queue_client, mock_data_proto): - """测试阻塞行为""" + def test_get_samples_blocking_behavior(self, message_queue_client, mock_sample): + """Test blocking behavior""" result = [] def get_samples(): - # 这会阻塞直到有足够样本 + # This will block until enough samples are available samples = message_queue_client.get_samples(min_batch_count=2) result.append(samples) def put_samples_later(): - time.sleep(0.5) # 延迟放入 - samples = [mock_data_proto, mock_data_proto] - message_queue_client.put_sample(sample=samples, param_version=1, rollout_metadata=None) + time.sleep(0.5) # Delay putting samples + message_queue_client.put_sample(sample=mock_sample, param_version=1) + message_queue_client.put_sample(sample=mock_sample, param_version=1) - # 启动消费者线程 + # Start consumer thread consumer_thread = threading.Thread(target=get_samples) producer_thread = threading.Thread(target=put_samples_later) consumer_thread.start() producer_thread.start() - # 等待两个线程完成 + # Wait for both threads to complete producer_thread.join(timeout=2) consumer_thread.join(timeout=2) @@ -185,34 +171,33 @@ def put_samples_later(): assert len(result[0]) == 2 def test_update_param_version(self, message_queue_client): - """测试更新参数版本""" + """Test updating parameter version""" message_queue_client.update_param_version(10) stats = message_queue_client.get_statistics() assert stats["current_param_version"] == 10 - def test_clear_queue(self, message_queue_client, mock_data_proto): - """测试清空队列""" - # 先添加一些样本 - samples = [mock_data_proto, mock_data_proto, mock_data_proto] - message_queue_client.put_sample(sample=samples, param_version=1, 
rollout_metadata=None) + def test_clear_queue(self, message_queue_client, mock_sample): + """Test clearing the queue""" + # First add some samples + for i in range(3): + message_queue_client.put_sample(sample=mock_sample, param_version=1) - # 清空队列 + # Clear the queue message_queue_client.clear_queue() - # 检查队列大小 + # Check queue size queue_size = message_queue_client.get_queue_size() assert queue_size == 0 - def test_get_queue_size(self, message_queue_client, mock_data_proto): - """测试获取队列大小""" + def test_get_queue_size(self, message_queue_client, mock_sample): + """Test getting queue size""" assert message_queue_client.get_queue_size() == 0 - samples = [mock_data_proto] - message_queue_client.put_sample(sample=samples, param_version=1, rollout_metadata=None) + message_queue_client.put_sample(sample=mock_sample, param_version=1) assert message_queue_client.get_queue_size() == 1 def test_get_statistics(self, message_queue_client): - """测试获取统计信息""" + """Test getting statistics""" stats = message_queue_client.get_statistics() expected_keys = { @@ -229,11 +214,11 @@ def test_get_statistics(self, message_queue_client): assert isinstance(stats["total_produced"], int) assert isinstance(stats["total_consumed"], int) - def test_get_memory_usage(self, message_queue_client, mock_data_proto): - """测试获取内存使用统计""" - # 添加一些样本 - samples = [mock_data_proto, mock_data_proto] - message_queue_client.put_sample(sample=samples, param_version=1, rollout_metadata=None) + def test_get_memory_usage(self, message_queue_client, mock_sample): + """Test getting memory usage statistics""" + # Add some samples + for i in range(2): + message_queue_client.put_sample(sample=mock_sample, param_version=1) memory_stats = message_queue_client.get_memory_usage() @@ -244,44 +229,44 @@ def test_get_memory_usage(self, message_queue_client, mock_data_proto): assert memory_stats["estimated_memory_mb"] > 0 def test_shutdown(self, ray_setup, basic_config): - """测试关闭功能""" - # 创建新的actor用于测试关闭 + """Test shutdown 
functionality""" + # Create new actor for testing shutdown actor = MessageQueue.remote(basic_config, max_queue_size=10) client = MessageQueueClient(actor) - # 关闭应该不抛出异常 + # Shutdown should not throw exceptions client.shutdown() class TestConcurrency: - """测试并发场景""" + """Test concurrent scenarios""" def setup_method(self): - """每个测试方法前的设置""" + """Setup before each test method""" if not ray.is_initialized(): ray.init(local_mode=True, ignore_reinit_error=True) def teardown_method(self): - """每个测试方法后的清理""" + """Cleanup after each test method""" if ray.is_initialized(): ray.shutdown() def create_message_queue_client(self, config=None): - """创建MessageQueue client的辅助方法""" + """Helper method to create MessageQueue client""" if config is None: config = DictConfig({"async_training": {"staleness_threshold": 3}}) actor = MessageQueue.remote(config, max_queue_size=10) return MessageQueueClient(actor) - def test_concurrent_put_get(self, mock_data_proto): - """测试并发放入和获取""" + def test_concurrent_put_get(self, mock_sample): + """Test concurrent put and get""" client = self.create_message_queue_client() try: results = [] def producer(): for i in range(50): - samples = [mock_data_proto, mock_data_proto] + samples = [mock_sample, mock_sample] result = client.put_sample(sample=samples, param_version=1, rollout_metadata=None) results.append(("put", result)) time.sleep(0.1) @@ -296,7 +281,7 @@ def consumer(): results.append(("get", False)) time.sleep(0.1) - # 启动生产者和消费者线程 + # Start producer and consumer threads producer_thread = threading.Thread(target=producer) consumer_thread = threading.Thread(target=consumer) @@ -307,7 +292,7 @@ def consumer(): producer_thread.join(timeout=5) consumer_thread.join(timeout=5) - # 检查结果 + # Check results put_results = [r[1] for r in results if r[0] == "put"] get_results = [r[1] for r in results if r[0] == "get"] @@ -317,245 +302,85 @@ def consumer(): client.shutdown() def test_consume_first_produce_later(self, message_queue_client, mock_data_proto): - 
"""测试先消费、后生产的场景 - 验证阻塞和唤醒机制""" + """Test consume first, produce later scenario - verify blocking and wake-up mechanism""" consumer_result = [] producer_result = [] - start_time = time.time() def consumer_task(): - """消费者任务:先启动,等待生产者生产数据""" - try: - # 记录开始消费的时间 - consumer_start = time.time() - # 这里会阻塞等待,直到有至少2个样本可用 - samples = message_queue_client.get_samples(min_batch_count=2) - consumer_end = time.time() - - consumer_result.append( - { - "success": True, - "samples_count": len(samples), - "wait_time": consumer_end - consumer_start, - "samples": samples, - } - ) - except Exception as e: - consumer_result.append({"success": False, "error": str(e), "wait_time": time.time() - consumer_start}) + """Consumer task: start first, wait for producer to generate data""" + # Record the start time of consumption + consumer_start = time.time() + # This will block until at least 3 samples are available + samples = message_queue_client.get_samples(min_batch_count=3) + consumer_end = time.time() + consumer_result.append( + { + "success": True, + "samples_count": len(samples), + "wait_time": consumer_end - consumer_start, + "samples": samples, + } + ) def producer_task(): - """生产者任务:延迟1秒后开始生产""" - try: - # 延迟1秒,确保消费者先开始等待 - time.sleep(1.0) - producer_start = time.time() - - # 分两次放入,验证消费者会等到足够的样本数量 - samples_1 = mock_data_proto - result1 = message_queue_client.put_sample( - sample=samples_1, param_version=1, rollout_metadata=[{"batch": "first"}] - ) - - # 短暂延迟后放入第二批 - time.sleep(0.1) - samples_2 = mock_data_proto - result2 = message_queue_client.put_sample( - sample=samples_2, param_version=1, rollout_metadata=[{"batch": "second"}] - ) - - samples_2 = mock_data_proto - result3 = message_queue_client.put_sample( - sample=samples_2, param_version=1, rollout_metadata=[{"batch": "second"}] - ) - - producer_end = time.time() - producer_result.append( - { - "success": result1 and result2, - "put_count": 2, - "produce_time": producer_end - producer_start, - "result1": result1, - "result2": 
result2, - } - ) - - print("produce finish") - - except Exception as e: - producer_result.append({"success": False, "error": str(e)}) - - # 启动消费者线程(先启动) + """Producer task: start producing after a delay""" + time.sleep(4.0) + producer_start = time.time() + message_queue_client.put_sample( + sample=mock_data_proto, + param_version=1, + ) + time.sleep(1) + message_queue_client.put_sample( + sample=mock_data_proto, + param_version=1, + ) + time.sleep(1) + message_queue_client.put_sample( + sample=mock_data_proto, + param_version=1, + ) + producer_end = time.time() + producer_result.append( + { + "put_count": 3, + "produce_time": producer_end - producer_start, + } + ) + + print("produce finish") + + # Start consumer thread (first) consumer_thread = threading.Thread(target=consumer_task, name="Consumer") - # 启动生产者线程(后启动) + time.sleep(3) + # Start producer thread (later) producer_thread = threading.Thread(target=producer_task, name="Producer") consumer_thread.start() - time.sleep(0.1) # 确保消费者先开始等待 + time.sleep(0.1) producer_thread.start() - print("=========") - # - # # 等待两个线程完成(设置超时避免死锁) - producer_thread.join() - # print("producer_result", producer_result) - # consumer_thread.join() - # print("consumer_thread", consumer_result) - # - # total_time = time.time() - start_time - # - # # 验证结果 - # assert len(consumer_result) == 1, "消费者应该执行一次" - # - # consumer_data = consumer_result[0] - # producer_data = producer_result[0] - # - # # 验证生产者成功 - # assert producer_data['success'], f"生产者失败: {producer_data.get('error', '')}" - # assert producer_data['put_count'] == 2, "应该生产2批数据" - # - # # 验证消费者成功 - # assert consumer_data['success'], f"消费者失败: {consumer_data.get('error', '')}" - # assert consumer_data['samples_count'] == 2, "消费者应该获取到2个样本" - # - # # 验证时序:消费者等待时间应该大于1秒(生产者的延迟时间) - # assert consumer_data['wait_time'] >= 1.0, f"消费者等待时间应该≥1秒,实际: {consumer_data['wait_time']:.2f}秒" - # - # # 验证数据完整性 - # assert all(isinstance(sample, QueueSample) for sample in consumer_data['samples']), 
"获取的样本应该是QueueSample类型" - # - # # 验证队列状态 - # final_queue_size = message_queue_client.get_queue_size() - # assert final_queue_size == 0, "队列应该被清空" - # - # stats = message_queue_client.get_statistics() - # assert stats['total_produced'] == 2, "应该生产了2个样本" - # assert stats['total_consumed'] == 2, "应该消费了2个样本" - # - # print(f"测试成功完成,总耗时: {total_time:.2f}秒") - # print(f"消费者等待时间: {consumer_data['wait_time']:.2f}秒") - # print(f"生产者执行时间: {producer_data['produce_time']:.2f}秒") - - def test_multiple_consumers_single_producer(self, message_queue_client, mock_data_proto): - """测试多个消费者等待单个生产者的场景""" - consumer_results = [] - producer_result = [] - - def consumer_task(consumer_id): - """消费者任务""" - try: - start_time = time.time() - samples = message_queue_client.get_samples(min_batch_count=1) - end_time = time.time() - - consumer_results.append( - { - "id": consumer_id, - "success": True, - "samples_count": len(samples), - "wait_time": end_time - start_time, - } - ) - except Exception as e: - consumer_results.append({"id": consumer_id, "success": False, "error": str(e)}) - - def producer_task(): - """生产者任务:延迟后批量生产""" - try: - time.sleep(1.5) # 确保所有消费者都在等待 - - # 生产3批数据,每批1个样本,供3个消费者消费 - for i in range(3): - samples = [mock_data_proto] - result = message_queue_client.put_sample( - sample=samples, param_version=1, rollout_metadata=[{"batch_id": i}] - ) - producer_result.append(result) - time.sleep(0.1) # 短暂间隔 - - except Exception as e: - producer_result.append(False) - - print("# 启动3个消费者线程") - # consumer_threads = [] - # for i in range(3): - # thread = threading.Thread(target=consumer_task, args=(i,), name=f"Consumer-{i}") - # consumer_threads.append(thread) - # thread.start() - # time.sleep(0.1) # 错开启动时间 - # - # # 启动生产者线程 - # producer_thread = threading.Thread(target=producer_task, name="Producer") - # producer_thread.start() - # - # # 等待所有线程完成 - # producer_thread.join(timeout=10) - # for thread in consumer_threads: - # thread.join(timeout=10) - # - # # 验证结果 - # assert 
len(consumer_results) == 3, "应该有3个消费者结果" - # assert len(producer_result) == 3, "应该生产3批数据" - # - # # 验证所有消费者都成功 - # for result in consumer_results: - # assert result['success'], f"消费者{result['id']}失败: {result.get('error', '')}" - # assert result['samples_count'] == 1, f"消费者{result['id']}应该获取1个样本" - # assert result['wait_time'] >= 1.5, f"消费者{result['id']}等待时间应该≥1.5秒" - # - # # 验证生产者都成功 - # assert all(producer_result), "所有生产操作都应该成功" - # - # # 验证最终状态 - # final_stats = message_queue_client.get_statistics() - # assert final_stats['total_produced'] == 3, "应该总共生产3个样本" - # assert final_stats['total_consumed'] == 3, "应该总共消费3个样本" - # assert final_stats['queue_size'] == 0, "队列应该被清空" - - def test_consumer_timeout_scenario(self, message_queue_client, mock_data_proto): - """测试消费者超时场景(通过关闭队列来模拟)""" - consumer_result = [] - - def consumer_task(): - """消费者任务:等待样本""" - try: - start_time = time.time() - # 尝试获取样本,但没有生产者会生产数据 - samples = message_queue_client.get_samples(min_batch_count=2) - end_time = time.time() - - consumer_result.append( - {"success": True, "samples_count": len(samples), "wait_time": end_time - start_time} - ) - except Exception as e: - consumer_result.append({"success": False, "error": str(e)}) - - def shutdown_task(): - """延迟关闭队列,模拟超时场景""" - time.sleep(2.0) # 让消费者等待2秒 - message_queue_client.shutdown() - - # 启动消费者和关闭任务 - consumer_thread = threading.Thread(target=consumer_task, name="Consumer") - shutdown_thread = threading.Thread(target=shutdown_task, name="Shutdown") + print("=========", flush=True) - consumer_thread.start() - time.sleep(0.1) - shutdown_thread.start() + producer_thread.join() + print("producer_result", producer_result, flush=True) + consumer_thread.join() + print("consumer_result", consumer_result, flush=True) - # 等待线程完成 - shutdown_thread.join(timeout=5) - consumer_thread.join(timeout=5) + assert len(consumer_result) == 1, "消费者应该执行一次" - # 验证结果 - assert len(consumer_result) == 1, "应该有一个消费者结果" + consumer_data = consumer_result[0] + producer_data = 
producer_result[0] - result = consumer_result[0] - # 消费者应该在队列关闭后返回空列表 - if result["success"]: - assert result["samples_count"] == 0, "关闭后应该返回空样本列表" + assert producer_data["put_count"] == 3 + assert consumer_data["samples_count"] == 3 - print(f"消费者等待了 {result.get('wait_time', 0):.2f} 秒后退出") + final_queue_size = message_queue_client.get_queue_size() + assert final_queue_size == 0 - # 运行测试的示例配置 + stats = message_queue_client.get_statistics() + assert stats["total_produced"] == 3 + assert stats["total_consumed"] == 3 if __name__ == "__main__": diff --git a/recipe/fully_async_policy/unittest/test_mq2.py b/recipe/fully_async_policy/unittest/test_mq2.py deleted file mode 100644 index d846a16dcb7..00000000000 --- a/recipe/fully_async_policy/unittest/test_mq2.py +++ /dev/null @@ -1,171 +0,0 @@ -# Copyright 2025 Meituan Ltd. and/or its affiliates -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import threading -import time -from unittest.mock import Mock - -import pytest -import ray -from omegaconf import DictConfig - -from recipe.fully_async_policy.message_queue import MessageQueue, MessageQueueClient, QueueSample - - -@pytest.fixture -def mock_data_proto(): - """Mock数据对象""" - return Mock() - - -@pytest.fixture -def basic_config(): - """基础配置""" - return DictConfig({"async_training": {"staleness_threshold": 3}}) - - -@pytest.fixture -def queue_config(): - """队列配置""" - return DictConfig({"async_training": {"staleness_threshold": 2}}) - - -@pytest.fixture -def ray_setup(): - """设置Ray环境""" - if not ray.is_initialized(): - ray.init(local_mode=True, ignore_reinit_error=True) - yield - ray.shutdown() - - -@pytest.fixture -def message_queue_client(ray_setup, basic_config): - """创建MessageQueue actor并返回其客户端""" - actor = MessageQueue.remote(basic_config, max_queue_size=10) - client = MessageQueueClient(actor) - yield client - client.shutdown() - - -class TestConcurrency: - """测试并发场景""" - - def setup_method(self): - """每个测试方法前的设置""" - if not ray.is_initialized(): - ray.init() - - def teardown_method(self): - """每个测试方法后的清理""" - if ray.is_initialized(): - ray.shutdown() - - def create_message_queue_client(self, config=None): - """创建MessageQueue client的辅助方法""" - if config is None: - config = DictConfig({"async_training": {"staleness_threshold": 3}}) - actor = MessageQueue.remote(config, max_queue_size=10) - return MessageQueueClient(actor) - - def test_consume_first_produce_later(self, message_queue_client, mock_data_proto): - """测试先消费、后生产的场景 - 验证阻塞和唤醒机制""" - consumer_result = [] - producer_result = [] - start_time = time.time() - - def consumer_task(): - """消费者任务:先启动,等待生产者生产数据""" - # 记录开始消费的时间 - consumer_start = time.time() - # 这里会阻塞等待,直到有至少2个样本可用 - samples = message_queue_client.get_samples(min_batch_count=3) - consumer_end = time.time() - consumer_result.append( - { - "success": True, - "samples_count": len(samples), - "wait_time": consumer_end - consumer_start, - 
"samples": samples, - } - ) - - def producer_task(): - """生产者任务:延迟1秒后开始生产""" - time.sleep(4.0) - producer_start = time.time() - message_queue_client.put_sample( - sample=mock_data_proto, - param_version=1, - ) - time.sleep(1) - message_queue_client.put_sample( - sample=mock_data_proto, - param_version=1, - ) - time.sleep(1) - message_queue_client.put_sample( - sample=mock_data_proto, - param_version=1, - ) - producer_end = time.time() - producer_result.append( - { - "put_count": 3, - "produce_time": producer_end - producer_start, - } - ) - - print("produce finish") - - # 启动消费者线程(先启动) - consumer_thread = threading.Thread(target=consumer_task, name="Consumer") - time.sleep(3) - # 启动生产者线程(后启动) - producer_thread = threading.Thread(target=producer_task, name="Producer") - - consumer_thread.start() - time.sleep(0.1) # 确保消费者先开始等待 - producer_thread.start() - - print("=========", flush=True) - # - # # 等待两个线程完成(设置超时避免死锁) - producer_thread.join() - print("producer_result", producer_result, flush=True) - consumer_thread.join() - print("consumer_result", consumer_result, flush=True) - - # 验证结果 - assert len(consumer_result) == 1, "消费者应该执行一次" - - consumer_data = consumer_result[0] - producer_data = producer_result[0] - - # 验证生产者成功 - assert producer_data["put_count"] == 3, "应该生产2批数据" - assert consumer_data["samples_count"] == 3, "消费者应该获取到2个样本" - - # 验证队列状态 - final_queue_size = message_queue_client.get_queue_size() - assert final_queue_size == 0, "队列应该被清空" - - stats = message_queue_client.get_statistics() - assert stats["total_produced"] == 3, "应该生产了2个样本" - assert stats["total_consumed"] == 3, "应该消费了2个样本" - # - - -# 运行测试的示例配置 -if __name__ == "__main__": - pytest.main([__file__, "-v", "--tb=short"]) diff --git a/recipe/fully_async_policy/unittest/test_protocol_split_merge.py b/recipe/fully_async_policy/unittest/test_protocol_split_merge.py index 7c959a791bb..a5c61f11ba6 100644 --- a/recipe/fully_async_policy/unittest/test_protocol_split_merge.py +++ 
b/recipe/fully_async_policy/unittest/test_protocol_split_merge.py @@ -16,7 +16,7 @@ import torch from tensordict import TensorDict -from verl.protocol import DataProto +from verl.protocol import DataProto, DataProtoItem def create_sample_dataproto(): @@ -435,32 +435,187 @@ def run_visual_comparison(): return success -if __name__ == "__main__": - print("Testing DataProto Split/Merge Functionality") - print("=" * 60) +def example_basic_split_merge(): + """Basic example of splitting DataProto into DataProtoItems and merging back.""" + print("=== Basic Split and Merge Example ===") + + # Create sample data + batch_size = 3 + seq_len = 5 + + # Create tensors + input_ids = torch.randint(0, 1000, (batch_size, seq_len)) + attention_mask = torch.ones(batch_size, seq_len) + + # Create non-tensor data + prompts = np.array(["Hello world", "How are you?", "Good morning"], dtype=object) + scores = np.array([0.8, 0.9, 0.7], dtype=object) + + # Create DataProto + data_proto = DataProto.from_dict( + tensors={"input_ids": input_ids, "attention_mask": attention_mask}, + non_tensors={"prompts": prompts, "scores": scores}, + meta_info={"model_name": "test_model", "version": "1.0"}, + ) + + print(f"Original DataProto length: {len(data_proto)}") + print(f"Input IDs shape: {data_proto.batch['input_ids'].shape}") + print(f"Prompts: {data_proto.non_tensor_batch['prompts']}") + + # Split into DataProtoItems + items = data_proto.to_items() + print(f"\nSplit into {len(items)} items") + + for i, item in enumerate(items): + print(f"Item {i}:") + print(f" Input IDs shape: {item.batch['input_ids'].shape}") + print(f" Prompt: {item.non_tensor_batch['prompts']}") + print(f" Score: {item.non_tensor_batch['scores']}") + + # Merge back to DataProto + merged_proto = DataProto.from_items(items) + print(f"\nMerged DataProto length: {len(merged_proto)}") + print(f"Merged Input IDs shape: {merged_proto.batch['input_ids'].shape}") + print(f"Merged prompts: {merged_proto.non_tensor_batch['prompts']}") + + # 
Verify they're identical + assert torch.equal(data_proto.batch["input_ids"], merged_proto.batch["input_ids"]) + assert torch.equal(data_proto.batch["attention_mask"], merged_proto.batch["attention_mask"]) + assert np.array_equal(data_proto.non_tensor_batch["prompts"], merged_proto.non_tensor_batch["prompts"]) + assert np.array_equal(data_proto.non_tensor_batch["scores"], merged_proto.non_tensor_batch["scores"]) + + print("\n✓ Original and merged DataProto are identical!") + + +def example_item_processing(): + """Example showing individual item processing before merging.""" + print("\n=== Individual Item Processing Example ===") + + # Create initial data + # batch_size = 4 + + values = torch.tensor([1.0, 2.0, 3.0, 4.0]).unsqueeze(1) # Shape: (4, 1) + labels = np.array(["A", "B", "C", "D"], dtype=object) + + original_proto = DataProto.from_dict( + tensors={"values": values}, non_tensors={"labels": labels}, meta_info={"processing_step": 0} + ) + + print(f"Original values: {original_proto.batch['values'].flatten()}") + print(f"Original labels: {original_proto.non_tensor_batch['labels']}") + + # Split and process each item individually + items = original_proto.to_items() + processed_items = [] + + for i, item in enumerate(items): + # Process the tensor data (multiply by 2) + processed_value = item.batch["values"] * 2 + + # Process the non-tensor data (add suffix) + processed_label = item.non_tensor_batch["labels"] + f"_processed_{i}" + # Create new processed item + processed_item = DataProtoItem( + batch=item.batch.clone(), # Clone the TensorDict + non_tensor_batch=item.non_tensor_batch.copy(), + meta_info=item.meta_info.copy(), + ) + + # Update with processed data + processed_item.batch["values"] = processed_value + processed_item.non_tensor_batch["labels"] = processed_label + processed_item.meta_info["processing_step"] = 1 + + processed_items.append(processed_item) + + print(f"Processed item {i}: value={processed_value.item()}, label='{processed_label}'") + + # Merge 
processed items back + processed_proto = DataProto.from_items(processed_items) + + print(f"\nProcessed values: {processed_proto.batch['values'].flatten()}") + print(f"Processed labels: {processed_proto.non_tensor_batch['labels']}") + print(f"Processing step: {processed_proto.meta_info['processing_step']}") + + +def example_convenience_methods(): + """Example showing convenience methods.""" + print("\n=== Convenience Methods Example ===") + + # Create a single DataProtoItem + single_tensor = torch.tensor([42]).unsqueeze(0) # Shape: (1,) + single_item = DataProtoItem( + batch=None, # We'll create TensorDict manually + non_tensor_batch={"text": "Hello"}, + meta_info={"source": "manual"}, + ) + + # Create TensorDict manually for the single item + from tensordict import TensorDict + + single_item.batch = TensorDict({"data": single_tensor}, batch_size=(1,)) + + print(f"Single item data: {single_item.batch['data']}") + print(f"Single item text: {single_item.non_tensor_batch['text']}") + + # Convert single item to DataProto using convenience method + single_proto = single_item.to_proto() + print(f"Converted to DataProto length: {len(single_proto)}") + + # Create multiple items and use static convenience method + items = [single_item] + for i in range(2): + new_item = single_item.copy() # Use the copy method + new_item.batch["data"] = torch.tensor([100 + i]).unsqueeze(0) + new_item.non_tensor_batch["text"] = f"Item {i + 1}" + items.append(new_item) + + # Use DataProtoItem.from_items() convenience method + merged_proto = DataProtoItem.from_items(items) + print(f"Merged using convenience method - length: {len(merged_proto)}") + print(f"Data: {merged_proto.batch['data'].flatten()}") + print(f"Texts: {merged_proto.non_tensor_batch['text']}") + + +def example_error_handling(): + """Example showing error handling.""" + print("\n=== Error Handling Example ===") + + # Try to create DataProto from empty list try: - # Run all tests - test_basic_split_and_merge() - 
test_individual_item_access() - test_partial_merge() - test_item_processing() - test_error_conditions() - test_roundtrip_integrity() - - # Run visual comparison - visual_success = run_visual_comparison() - - if visual_success: - print("\n" + "=" * 60) - print("🎉 ALL TESTS PASSED!") - print("DataProto split/merge functionality is working correctly.") - else: - print("\n" + "=" * 60) - print("❌ SOME TESTS FAILED!") + DataProto.from_items([]) + print("ERROR: Should have raised exception for empty list") + except ValueError as e: + print(f"✓ Correctly caught error for empty list: {e}") - except Exception as e: - print(f"\n❌ Test failed with exception: {e}") - import traceback + # Try to merge items with inconsistent structure + try: + item1 = DataProtoItem( + batch=TensorDict({"data": torch.tensor([1]).unsqueeze(0)}, batch_size=(1,)), + non_tensor_batch={"text": "Hello"}, + ) + item2 = DataProtoItem( + batch=TensorDict({"different_key": torch.tensor([2]).unsqueeze(0)}, batch_size=(1,)), + non_tensor_batch={"text": "World"}, + ) - traceback.print_exc() + DataProto.from_items([item1, item2]) + print("ERROR: Should have raised exception for inconsistent structure") + except ValueError as e: + print(f"✓ Correctly caught error for inconsistent structure: {e}") + + +if __name__ == "__main__": + # Run all tests + test_basic_split_and_merge() + test_individual_item_access() + test_partial_merge() + test_item_processing() + test_error_conditions() + test_roundtrip_integrity() + example_basic_split_merge() + example_item_processing() + example_convenience_methods() + example_error_handling() + run_visual_comparison() From 1cfebfe1bb4c2a170fd46be83e27cf275aaa2566 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Tue, 12 Aug 2025 14:14:52 +0800 Subject: [PATCH 035/182] english notes --- .../config/fully_async_ppo_trainer.yaml | 7 -- recipe/fully_async_policy/fully_async_main.py | 13 ++- .../fully_async_rollouter.py | 51 +++++------- .../fully_async_policy/fully_async_trainer.py | 4 +- 
recipe/fully_async_policy/message_queue.py | 79 +++++++++---------- 5 files changed, 63 insertions(+), 91 deletions(-) diff --git a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml index a5f58fadc2f..665f7a8be89 100644 --- a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml +++ b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml @@ -12,17 +12,10 @@ async_training: # 新鲜度控制 (Freshness Control) staleness_threshold: 3 # 样本新鲜度阈值 - # 参数同步 (Parameter Synchronization) - max_sync_retries: 3 # 参数同步最大重试次数 - sync_timeout: 30.0 # 同步超时时间(秒) - sync_retry_delay: 1.0 # 重试延迟时间(秒) - # Rollout配置 rollout: nnodes: 1 # Number of nodes used in the rollout n_gpus_per_node: 8 # Number of GPUs per node - mode: async # rollout模式: sync, async - name: vllm # rollout引擎: vllm, sglang n: 4 # 每个prompt生成的响应数量 total_rollout_steps: 100 total_epochs: 10 diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py index 6afb44abd9d..39eacb86314 100644 --- a/recipe/fully_async_policy/fully_async_main.py +++ b/recipe/fully_async_policy/fully_async_main.py @@ -40,11 +40,10 @@ def create_resource_pool_manager(config, roles: list) -> ResourcePoolManager: Returns: ResourcePoolManager: Resource pool manager """ - # 构建资源池规格 resource_pool_spec = {} mapping = {} - # Actor/Critic资源池(训练相关) + # Actor/Critic resource pool if any(role in roles for role in [Role.Actor, Role.Critic, Role.RefPolicy, Role.RewardModel]): assert config.trainer.n_gpus_per_node > 0, "config.trainer.n_gpus_per_node must be greater than 0" assert config.trainer.nnodes > 0, "config.trainer.nnodes must be greater than 0" @@ -52,12 +51,12 @@ def create_resource_pool_manager(config, roles: list) -> ResourcePoolManager: trainer_pool = [config.trainer.n_gpus_per_node] * config.trainer.nnodes resource_pool_spec["trainer_pool"] = trainer_pool - # 训练相关角色映射到同一个资源池 + # Map training-related roles to the same 
resource pool for role in [Role.Actor, Role.Critic, Role.RefPolicy, Role.RewardModel]: if role in roles: mapping[role] = "trainer_pool" - # Rollout资源池 + # Rollout resource pool if Role.Rollout in roles: assert config.rollout.n_gpus_per_node > 0, "config.rollout.n_gpus_per_node must be greater than 0" assert config.rollout.nnodes > 0, "config.rollout.nnodes must be greater than 0" @@ -79,7 +78,7 @@ def create_role_worker_mapping(config): Returns: dict: Mapping from roles to worker classes """ - # 根据策略选择worker类 + # Select worker class based on strategy if config.actor_rollout_ref.actor.strategy == "fsdp2": assert config.actor_rollout_ref.actor.strategy == config.critic.strategy from recipe.one_step_off_policy.fsdp_workers import ( @@ -148,7 +147,6 @@ def __init__(self): self.shutdown_event = threading.Event() def run(self, config): - """运行完全异步的PPO训练""" print("Starting fully async PPO training...") self._initialize_components(config) self._run_training_loop() @@ -172,7 +170,7 @@ def _initialize_components(self, config) -> None: self.components["tokenizer"] = tokenizer self.components["processor"] = processor - self.components["config"] = config # 保存config以供其他方法使用 + self.components["config"] = config print("Creating worker mapping and resource pools...") role_worker_mapping, ray_worker_group_cls = create_role_worker_mapping(config) @@ -278,7 +276,6 @@ def _run_training_loop(self): ray.get(trainer_future) self.components["message_queue_client"].clear_queue() - print("Training completed or interrupted") diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 01affa67586..d2abf3dab2f 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -72,7 +72,6 @@ def __init__( self.use_reference_policy = False self.use_rm = False - # Create datasets print("Creating datasets...") from verl.trainer.main_ppo import create_rl_dataset, create_rl_sampler 
from verl.utils.dataset.rl_dataset import collate_fn @@ -186,7 +185,6 @@ def _create_continuous_iterator(self): yield epoch, batch_dict def fit(self): - """开始异步生成样本 - 改进的主运行逻辑""" print("Starting FullyAsyncRollouter...") if self.message_queue_client is None: @@ -199,15 +197,12 @@ def fit(self): self.running = True self.paused = False - # 创建并启动生成线程 self.generation_thread = threading.Thread(target=self._generation_loop, daemon=True) self.generation_thread.start() - # 创建并启动监控线程 self.monitor_thread = threading.Thread(target=self._monitor_loop, daemon=True) self.monitor_thread.start() - # 等待线程完成 self.generation_thread.join() self.monitor_thread.join() @@ -215,16 +210,17 @@ def fit(self): def _generation_loop(self): """ - 主要的生成循环 - 循环入口,需要 - 1. running 判断 - 4. 中断判断 - 3. 新鲜度判断 + Main Generation Loop - 生成样本过程中,需要 - 1. running 判断 - 2. 中断判断 + Loop Entry Requirements: + 1. Running status validation + 2. Interruption detection + 3. Freshness validation + + During Sample Generation Process: + 1. Running status validation + 2. 
Interruption detection """ from verl.utils.tracking import Tracking @@ -265,12 +261,10 @@ def _generation_loop(self): if self._should_pause_generation(): self.pause() - # 如果被暂停,等待恢复 while self.paused and self.running: print("Generation thread paused, waiting...") self.condition.wait() - # 再次检查运行状态 if not self.running: break @@ -292,7 +286,7 @@ def _generation_loop(self): gen_batch_output.meta_info.pop("timing", None) if gen_batch_output is not None: - # 准备rollout metadata + # prepare rollout metadata rollout_metadata = { "timing": timing_raw, "generation_timestamp": time.time(), @@ -306,7 +300,6 @@ def _generation_loop(self): data=sample, rollout_metadata=rollout_metadata, ) - # 放入队列 success = self.message_queue_client.put_sample( sample=ray.cloudpickle.dumps(queue_sample), param_version=self.current_param_version, @@ -341,11 +334,9 @@ def _generation_loop(self): ) def _monitor_loop(self): - """监控线程 - 监控状态并处理控制信号""" - # 主线程保持运行,处理控制信号和状态监控 last_stats_time = time.time() - stats_interval = 30.0 # 30秒报告一次统计 - check_interval = 5.0 # 5秒检查一次状态 + stats_interval = 30.0 + check_interval = 5.0 while True: with self.lock: if not self.running: @@ -356,7 +347,6 @@ def _monitor_loop(self): if current_time - last_stats_time >= stats_interval: print(self.get_statistics()) last_stats_time = current_time - # 检查是否应该恢复生成 if not self._should_pause_generation(): with self.lock: if self.paused: @@ -365,18 +355,14 @@ def _monitor_loop(self): print("Generation resumed") def _should_pause_generation(self) -> bool: - """ - 判断是否应该暂停生成,基于新鲜度控制 - 改进的判断逻辑 - """ + """Determine whether the build should be paused""" try: queue_stats = self.message_queue_client.get_statistics() queue_size = queue_stats["queue_size"] current_trainer_version = queue_stats["current_param_version"] - # 计算参数版本差异 version_diff = self.current_param_version - current_trainer_version - # 如果版本差异过大,暂停生成 if version_diff >= self.staleness_threshold: print( f"Should pause due to staleness: 
rollout_version={self.current_param_version}, " @@ -384,7 +370,6 @@ def _should_pause_generation(self) -> bool: ) return True - # 如果队列太满,也暂停生成 if queue_size >= self.max_queue_size: print(f"Should pause due to full queue: size={queue_size}, max={self.max_queue_size}") return True @@ -393,11 +378,11 @@ def _should_pause_generation(self) -> bool: except Exception as e: print(f"Error checking pause conditions: {e}") - return True # 出错时暂停生成 + return True def pause(self) -> bool: - """暂停生成 - TODO 集成 Partial Rollout + """ pause rollout + TODO integrated Partial Rollout """ print("[rollouter] pause") with self.lock: @@ -411,8 +396,8 @@ def pause(self) -> bool: return True def resume(self) -> bool: - """恢复生成 - TODO 集成 Partial Rollout + """ resume rollout + TODO integrated Partial Rollout """ print("[rollouter] resume") with self.lock: diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 39fedb022d5..bbc5cfa75d0 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -39,8 +39,8 @@ @ray.remote class FullyAsyncTrainer(RayPPOTrainer): """ - 完全异步的PPO训练器,从MessageQueue获取样本进行训练 - 基于OneStepOffRayTrainer的成熟实现改进 + A fully asynchronous PPO trainer that obtains samples from a MessageQueue for training. 
+ Based on an improved implementation of OneStepOffRayTrainer """ def __init__( diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py index ad261b0072a..c4c7d85f5a7 100644 --- a/recipe/fully_async_policy/message_queue.py +++ b/recipe/fully_async_policy/message_queue.py @@ -26,8 +26,6 @@ @dataclass class QueueSample: - """单个batch样本,包含参数版本和新鲜度信息""" - data: Any rollout_metadata: dict[str, Any] @@ -35,7 +33,7 @@ class QueueSample: @ray.remote(num_cpus=10, max_concurrency=10) class MessageQueue: """ - 简化的Ray-based异步消息队列,用于Rollouter和Trainer之间的通信 + Simplified Ray-based asynchronous message queue for communication between Rollouter and Trainer """ def __init__(self, config: DictConfig, max_queue_size: int = 1000): @@ -44,7 +42,6 @@ def __init__(self, config: DictConfig, max_queue_size: int = 1000): self.queue = deque(maxlen=max_queue_size) self.current_param_version = 0 - # 安全地获取配置值 try: if hasattr(config, "async_training") and config.async_training is not None: self.staleness_threshold = getattr(config.async_training, "staleness_threshold", 3) @@ -56,40 +53,40 @@ def __init__(self, config: DictConfig, max_queue_size: int = 1000): # Threading for message handling self.running = True - # 线程安全 + # thread safe self.lock = threading.RLock() self.consumer_condition = threading.Condition(self.lock) - # 统计信息 + # statistic message self.total_produced = 0 self.total_consumed = 0 self.dropped_samples = 0 logger.info( f"MessageQueue initialized with max_queue_size={max_queue_size}," - "staleness_threshold={self.staleness_threshold}" + f"staleness_threshold={self.staleness_threshold}" ) def put_sample(self, sample: Any, param_version: int) -> bool: """ - 放入一个batch样本到队列 + Put a batch sample into the queue Args: - sample: 样本数据 - param_version: 参数版本号 + sample: Sample data + param_version: Parameter version number Returns: - bool: 是否成功放入队列 + bool: Whether the sample was successfully put into the queue """ with self.lock: - # 检查新鲜度 + # Check 
freshness staleness = self.current_param_version - param_version if staleness >= self.staleness_threshold: self.dropped_samples += 1 logger.debug(f"Dropped stale sample: staleness={staleness}, threshold={self.staleness_threshold}") return False - # 如果队列满了,移除最旧的样本,一般不会发生 + # If queue is full, remove the oldest sample (rarely happens) if len(self.queue) >= self.max_queue_size: removed = self.queue.popleft() self.dropped_samples += 1 @@ -97,7 +94,7 @@ def put_sample(self, sample: Any, param_version: int) -> bool: self.queue.append(sample) self.total_produced += 1 - # 通知等待的消费者 + # Notify waiting consumers self.consumer_condition.notify() if self.total_produced % 100 == 0: @@ -107,13 +104,13 @@ def put_sample(self, sample: Any, param_version: int) -> bool: def get_samples(self, min_batch_count: int = 1) -> list[Any]: """ - 从队列获取batch样本,一直等待直到有足够样本 + Get batch samples from the queue, wait until enough samples are available Args: - min_batch_count: sample数量满足min_batch,一次性获取 + min_batch_count: Get samples at once when sample count meets min_batch Returns: - List[Any]: 获取的样本列表 + List[Any]: List of retrieved samples """ print("get_samples") @@ -125,11 +122,11 @@ def get_samples(self, min_batch_count: int = 1) -> list[Any]: return [] self.consumer_condition.wait() - # 如果队列已关闭且没有足够样本,返回空列表 + # If queue is closed and doesn't have enough samples, return empty list if not self.running and len(self.queue) < min_batch_count: return [] - # 获取指定数量的样本 + # Get specified number of samples batch_count = min(min_batch_count, len(self.queue)) samples = [] for _ in range(batch_count): @@ -144,19 +141,19 @@ def get_samples(self, min_batch_count: int = 1) -> list[Any]: return samples def update_param_version(self, version: int): - """更新当前参数版本""" + """Update current parameter version""" with self.lock: old_version = self.current_param_version self.current_param_version = version logger.debug(f"Parameter version updated from {old_version} to {version}") def get_queue_size(self) -> int: - 
"""获取当前队列长度""" + """Get current queue length""" with self.lock: return len(self.queue) def get_statistics(self) -> dict[str, Any]: - """获取队列统计信息""" + """Get queue statistics""" with self.lock: return { "queue_size": len(self.queue), @@ -169,41 +166,41 @@ def get_statistics(self) -> dict[str, Any]: } def clear_queue(self): - """清空队列""" + """Clear the queue""" with self.lock: cleared_count = len(self.queue) self.queue.clear() logger.info(f"Cleared {cleared_count} samples from queue") def shutdown(self): - """关闭消息队列""" + """Shutdown the message queue""" with self.lock: self.running = False - # 通知所有等待的线程,让它们能够退出 + # Notify all waiting threads so they can exit self.consumer_condition.notify_all() logger.info("MessageQueue shutdown") def get_memory_usage(self) -> dict: - """获取内存使用统计""" + """Get memory usage statistics""" with self.lock: - # 估算队列中样本的内存使用 + # Estimate memory usage of samples in queue import sys total_size = 0 sample_count = len(self.queue) if sample_count > 0: - # 估算单个样本的大小(简化估算) + # Estimate size of a single sample (simplified estimation) sample = list(self.queue)[0] try: sample_size = sys.getsizeof(sample) if hasattr(sample.data, "batch") and hasattr(sample.data.batch, "__len__"): - # 如果有batch信息,估算数据大小 + # If batch info is available, estimate data size batch_size = len(sample.data.batch) - sample_size += batch_size * 1000 # 粗略估算每个batch条目1KB + sample_size += batch_size * 1000 # Roughly estimate 1KB per batch entry total_size = sample_size * sample_count except Exception: - total_size = sample_count * 10000 # 粗略估算每个样本10KB + total_size = sample_count * 10000 # Roughly estimate 10KB per sample return { "queue_samples": sample_count, @@ -213,39 +210,39 @@ def get_memory_usage(self) -> dict: class MessageQueueClient: - """MessageQueue的客户端,用于与MessageQueue Actor通信""" + """MessageQueue client for communicating with MessageQueue Actor""" def __init__(self, queue_actor: Any): self.queue_actor = queue_actor def put_sample(self, sample: Any, param_version: int) -> 
bool: - """放入batch到队列""" + """Put batch into queue""" return ray.get(self.queue_actor.put_sample.remote(sample, param_version)) def get_samples(self, min_batch_count: int = 1) -> list[Any]: - """从队列获取batch,一直等待直到有足够样本""" + """Get batch from queue, wait until enough samples are available""" return ray.get(self.queue_actor.get_samples.remote(min_batch_count)) def update_param_version(self, version: int): - """更新参数版本""" + """Update parameter version""" ray.get(self.queue_actor.update_param_version.remote(version)) def get_queue_size(self) -> int: - """获取队列大小""" + """Get queue size""" return ray.get(self.queue_actor.get_queue_size.remote()) def get_statistics(self) -> dict[str, Any]: - """获取统计信息""" + """Get statistics""" return ray.get(self.queue_actor.get_statistics.remote()) def clear_queue(self): - """清空队列""" + """Clear queue""" ray.get(self.queue_actor.clear_queue.remote()) def shutdown(self): - """关闭队列""" + """Shutdown queue""" ray.get(self.queue_actor.shutdown.remote()) def get_memory_usage(self) -> dict: - """获取内存使用统计""" - return ray.get(self.queue_actor.get_memory_usage.remote()) + """Get memory usage statistics""" + return ray.get(self.queue_actor.get_memory_usage.remote()) \ No newline at end of file From 5d108bfe48b0083f3966b3f450b8cc655a1e3fb5 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Tue, 12 Aug 2025 14:15:12 +0800 Subject: [PATCH 036/182] english notes --- recipe/fully_async_policy/fully_async_rollouter.py | 4 ++-- recipe/fully_async_policy/message_queue.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index d2abf3dab2f..b0b270ba685 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -381,7 +381,7 @@ def _should_pause_generation(self) -> bool: return True def pause(self) -> bool: - """ pause rollout + """pause rollout TODO integrated Partial Rollout """ 
print("[rollouter] pause") @@ -396,7 +396,7 @@ def pause(self) -> bool: return True def resume(self) -> bool: - """ resume rollout + """resume rollout TODO integrated Partial Rollout """ print("[rollouter] resume") diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py index c4c7d85f5a7..3efe982752d 100644 --- a/recipe/fully_async_policy/message_queue.py +++ b/recipe/fully_async_policy/message_queue.py @@ -245,4 +245,4 @@ def shutdown(self): def get_memory_usage(self) -> dict: """Get memory usage statistics""" - return ray.get(self.queue_actor.get_memory_usage.remote()) \ No newline at end of file + return ray.get(self.queue_actor.get_memory_usage.remote()) From 796880ea3e21364cdc1622200a9ca6164f786013 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Tue, 12 Aug 2025 15:36:46 +0800 Subject: [PATCH 037/182] update print --- recipe/fully_async_policy/fully_async_main.py | 38 ++++--- .../fully_async_rollouter.py | 101 +++++++++--------- .../fully_async_policy/fully_async_trainer.py | 61 ++++------- recipe/fully_async_policy/message_queue.py | 10 +- recipe/fully_async_policy/param_sync.py | 6 +- verl/trainer/ppo/ray_trainer.py | 8 -- 6 files changed, 98 insertions(+), 126 deletions(-) diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py index 39eacb86314..163b2420381 100644 --- a/recipe/fully_async_policy/fully_async_main.py +++ b/recipe/fully_async_policy/fully_async_main.py @@ -147,16 +147,16 @@ def __init__(self): self.shutdown_event = threading.Event() def run(self, config): - print("Starting fully async PPO training...") + print("[ASYNC MAIN] Starting fully async PPO training...") self._initialize_components(config) self._run_training_loop() def _initialize_components(self, config) -> None: - print(f"TaskRunner hostname: {socket.gethostname()}, PID: {os.getpid()}") + print(f"[ASYNC MAIN] TaskRunner hostname: {socket.gethostname()}, PID: {os.getpid()}") 
pprint(OmegaConf.to_container(config, resolve=True)) OmegaConf.resolve(config) - print("Initializing model and tokenizer...") + print("[ASYNC MAIN] Initializing model and tokenizer...") local_path = copy_to_local( config.actor_rollout_ref.model.path, use_shm=config.actor_rollout_ref.model.get("use_shm", False) ) @@ -172,12 +172,12 @@ def _initialize_components(self, config) -> None: self.components["processor"] = processor self.components["config"] = config - print("Creating worker mapping and resource pools...") + print("[ASYNC MAIN] Creating worker mapping and resource pools...") role_worker_mapping, ray_worker_group_cls = create_role_worker_mapping(config) self.components["role_worker_mapping"] = role_worker_mapping self.components["ray_worker_group_cls"] = ray_worker_group_cls - print("Loading reward functions...") + print("[ASYNC MAIN] Loading reward functions...") reward_fn = load_reward_manager( config, tokenizer, num_examine=0, **config.reward_model.get("reward_kwargs", {}) ) @@ -187,25 +187,24 @@ def _initialize_components(self, config) -> None: self.components["reward_fn"] = reward_fn self.components["val_reward_fn"] = val_reward_fn - self.max_queue_size = ( - config.async_training.staleness_threshold - * config.data.train_batch_size - * config.actor_rollout_ref.rollout.n - ) * 10 # x 10 avoid deadlock - print("Creating MessageQueue...") + self.max_queue_size = ((config.async_training.staleness_threshold + 1) + * config.data.train_batch_size + * config.actor_rollout_ref.rollout.n + ) * 10 # x 10 avoid deadlock + print("[ASYNC MAIN] Creating MessageQueue...") message_queue = MessageQueue.remote(config, self.max_queue_size) message_queue_client = MessageQueueClient(message_queue) self.components["message_queue"] = message_queue self.components["message_queue_client"] = message_queue_client - print("Creating FullyAsyncRollouter...") + print("[ASYNC MAIN] Creating FullyAsyncRollouter...") self._create_rollouter(config) - print("Creating FullyAsyncTrainer...") 
+ print("[ASYNC MAIN] Creating FullyAsyncTrainer...") self._create_trainer(config) - print("Setting up parameter synchronization...") + print("[ASYNC MAIN] Setting up parameter synchronization...") from recipe.fully_async_policy.param_sync import ParameterSynchronizer param_synchronizer = ParameterSynchronizer.remote( @@ -221,10 +220,9 @@ def _initialize_components(self, config) -> None: ray.get(param_synchronizer.sync_weights.remote(0)) self.components["param_synchronizer"] = param_synchronizer - print("All components initialized successfully") + print("[ASYNC MAIN] All components initialized successfully") def _create_rollouter(self, config) -> None: - pprint(self.components) rollouter = FullyAsyncRollouter.remote( config=config, tokenizer=self.components["tokenizer"], @@ -239,7 +237,7 @@ def _create_rollouter(self, config) -> None: ray.get(rollouter.init_workers.remote()) ray.get(rollouter.set_message_queue_client.remote(self.components["message_queue_client"])) self.components["rollouter"] = rollouter - print("Rollouter created and initialized successfully") + print("[ASYNC MAIN] Rollouter created and initialized successfully") def _create_trainer(self, config) -> None: trainer_role_mapping = { @@ -263,12 +261,12 @@ def _create_trainer(self, config) -> None: ray.get(trainer.init_workers.remote()) ray.get(trainer.set_message_queue_client.remote(self.components["message_queue_client"])) self.components["trainer"] = trainer - print("FullyAsyncTrainer created and initialized successfully") + print("[ASYNC MAIN] FullyAsyncTrainer created and initialized successfully") def _run_training_loop(self): self.running = True - print("Starting Rollouter in background...") + print("[ASYNC MAIN] Starting Rollouter in background...") rollouter_future = self.components["rollouter"].fit.remote() trainer_future = self.components["trainer"].fit.remote() @@ -276,7 +274,7 @@ def _run_training_loop(self): ray.get(trainer_future) self.components["message_queue_client"].clear_queue() - 
print("Training completed or interrupted") + print("[ASYNC MAIN] Training completed or interrupted") @hydra.main(config_path="config", config_name="fully_async_ppo_trainer", version_base=None) diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index b0b270ba685..2d9d839feca 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -35,17 +35,17 @@ class FullyAsyncRollouter(RayPPOTrainer): """ def __init__( - self, - config, - tokenizer, - role_worker_mapping: dict[Role, WorkerType], - resource_pool_manager: ResourcePoolManager, - ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, - processor=None, - reward_fn=None, - val_reward_fn=None, - device_name=None, - max_queue_size=1000, + self, + config, + tokenizer, + role_worker_mapping: dict[Role, WorkerType], + resource_pool_manager: ResourcePoolManager, + ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, + processor=None, + reward_fn=None, + val_reward_fn=None, + device_name=None, + max_queue_size=1000, ): # Store the tokenizer for text processing self.tokenizer = tokenizer @@ -72,7 +72,7 @@ def __init__( self.use_reference_policy = False self.use_rm = False - print("Creating datasets...") + print(f"[ROLLOUTER] Creating datasets...") from verl.trainer.main_ppo import create_rl_dataset, create_rl_sampler from verl.utils.dataset.rl_dataset import collate_fn @@ -81,7 +81,7 @@ def __init__( train_sampler = create_rl_sampler(config.data, train_dataset) self._validate_config() - pprint(f"Rollouter _create_dataloader...\n{train_dataset}\n{val_dataset}") + print(f"[ROLLOUTER] Rollouter _create_dataloader...\n{train_dataset}\n{val_dataset}") self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler) total_rollout_steps = len(self.train_dataloader) * self.config.trainer.total_epochs @@ -90,7 +90,7 @@ def __init__( total_rollout_steps = 
self.config.rollout.total_rollout_steps self.total_rollout_steps = total_rollout_steps - print(f"Total rollout steps: {self.total_rollout_steps}") + print(f"[ROLLOUTER] Total rollout steps: {self.total_rollout_steps}") # Rollouter parameter configuration self.message_queue_client = None @@ -103,8 +103,14 @@ def __init__( # Statistics self.total_generated_samples = 0 + self.train_step_samples = 0 self.dropped_stale_samples = 0 - self.param_sync_requests = 0 + + # Calculate the samples needed for a train, used to calculate staleness and interrupt rollout + n_responses_per_prompt = self.config.actor_rollout_ref.rollout.n + batch_size = self.config.data.train_batch_size + required_samples = n_responses_per_prompt * batch_size + self.max_required_samples = required_samples * (self.staleness_threshold + 1) # Worker groups self.rollout_wg = None @@ -120,17 +126,13 @@ def __init__( self.condition = threading.Condition(self.lock) # Pause/resume statistics - self.pause_count = 0 - self.resume_count = 0 self.total_pause_time = 0.0 self.last_pause_time = None # Parameter synchronization related self.param_synchronizer = None - self.last_sync_time = 0 - self.sync_in_progress = False - self.sync_lock = threading.Lock() + # queue size self.max_queue_size = max_queue_size def set_message_queue_client(self, message_queue_client: MessageQueueClient): @@ -152,12 +154,14 @@ def update_param_version(self, version: int): with self.lock: old_version = self.current_param_version self.current_param_version = version - print(f"Parameter version updated from {old_version} to {version}") + # every time param change, reset train_step_samples + self.train_step_samples = 0 + print(f"[ROLLOUTER] Parameter version updated from {old_version} to {version}") def _validate_config(self): # Validate asynchronous training configuration if not hasattr(self.config, "async_training"): - raise ValueError("Missing async_training configuration") + raise ValueError("[ROLLOUTER] Missing async_training 
configuration") def _create_actor_rollout_classes(self): # only create rollout @@ -185,7 +189,7 @@ def _create_continuous_iterator(self): yield epoch, batch_dict def fit(self): - print("Starting FullyAsyncRollouter...") + print(f"[ROLLOUTER] Starting FullyAsyncRollouter...") if self.message_queue_client is None: raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.") @@ -206,7 +210,7 @@ def fit(self): self.generation_thread.join() self.monitor_thread.join() - print("Rollouter fit completed") + print(f"[ROLLOUTER] Rollouter fit completed") def _generation_loop(self): """ @@ -217,6 +221,7 @@ def _generation_loop(self): 1. Running status validation 2. Interruption detection 3. Freshness validation + 4. train_step_samples validation During Sample Generation Process: 1. Running status validation @@ -242,7 +247,7 @@ def _generation_loop(self): if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True): val_metrics = self._validate() assert val_metrics, f"{val_metrics=}" - pprint(f"Initial validation metrics: {val_metrics}") + pprint(f"[ROLLOUTER] Initial validation metrics: {val_metrics}") self.logger.log(data=val_metrics, step=self.global_steps) if self.config.trainer.get("val_only", False): return @@ -262,7 +267,7 @@ def _generation_loop(self): self.pause() while self.paused and self.running: - print("Generation thread paused, waiting...") + print(f"[ROLLOUTER] Generation thread paused, waiting...") self.condition.wait() if not self.running: @@ -304,24 +309,17 @@ def _generation_loop(self): sample=ray.cloudpickle.dumps(queue_sample), param_version=self.current_param_version, ) - print(f"put samples {success}") with self.lock: if success: self.total_generated_samples += 1 + self.train_step_samples += 1 else: self.dropped_stale_samples += 1 - if self.global_steps % 1 == 0: - print( - f"Generated {self.total_generated_samples} batches, \n" - f"param_version={self.current_param_version}, \n" - f"Dropped stale 
samples: {self.dropped_stale_samples}\n" - ) - self.global_steps += 1 if is_last_step: - pprint(f"Final validation metrics: {last_val_metrics}") + pprint(f"[ROLLOUTER] Final validation metrics: {last_val_metrics}") break with self.lock: @@ -345,14 +343,14 @@ def _monitor_loop(self): # 定期打印统计信息 current_time = time.time() if current_time - last_stats_time >= stats_interval: - print(self.get_statistics()) + print(f"[ROLLOUTER] {self.get_statistics()}") last_stats_time = current_time if not self._should_pause_generation(): with self.lock: if self.paused: self.paused = False self.condition.notify_all() - print("Generation resumed") + print(f"[ROLLOUTER] Generation resumed") def _should_pause_generation(self) -> bool: """Determine whether the build should be paused""" @@ -363,28 +361,35 @@ def _should_pause_generation(self) -> bool: version_diff = self.current_param_version - current_trainer_version - if version_diff >= self.staleness_threshold: + if version_diff > self.staleness_threshold: print( - f"Should pause due to staleness: rollout_version={self.current_param_version}, " + "[ROLLOUTER] " + f"Should pause due to version_diff > self.staleness_threshold: " + f"rollout_version={self.current_param_version}, " f"trainer_version={current_trainer_version}, diff={version_diff}" ) return True if queue_size >= self.max_queue_size: - print(f"Should pause due to full queue: size={queue_size}, max={self.max_queue_size}") + print(f"[ROLLOUTER] Should pause due to full queue: size={queue_size}, max={self.max_queue_size}") + return True + + if self.train_step_samples >= self.max_required_samples: + print(f"[ROLLOUTER] Should pause due to step_generated_samples >= max_required_samples: " + f"self.step_generated_samples={self.train_step_samples}, max={self.max_required_samples}") return True return False except Exception as e: - print(f"Error checking pause conditions: {e}") + print(f"[ROLLOUTER] Error checking pause conditions: {e}") return True def pause(self) -> bool: """pause 
rollout TODO integrated Partial Rollout """ - print("[rollouter] pause") + print(f"[ROLLOUTER] pause") with self.lock: if not self.running: return False @@ -399,7 +404,7 @@ def resume(self) -> bool: """resume rollout TODO integrated Partial Rollout """ - print("[rollouter] resume") + print(f"[ROLLOUTER] resume") with self.lock: if not self.running: return False @@ -409,20 +414,18 @@ def resume(self) -> bool: self.paused = False self.condition.notify_all() - print("Generation resumed") return True def get_statistics(self) -> dict: with self.lock: queue_stats = self.message_queue_client.get_statistics() stats = { + "is_running": self.running, "total_generated_samples": self.total_generated_samples, + "train_step_samples": self.train_step_samples, "dropped_stale_samples": self.dropped_stale_samples, "current_param_version": self.current_param_version, - "param_sync_requests": self.param_sync_requests, - "last_sync_time": self.last_sync_time, - "is_running": self.running, - "sync_in_progress": self.sync_in_progress, - "queue_size": f"{queue_stats['queue_size']}", + "queue_size": queue_stats['queue_size'], + "queue_max_size": self.max_queue_size, } return stats diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index bbc5cfa75d0..418ab024d0a 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -44,16 +44,16 @@ class FullyAsyncTrainer(RayPPOTrainer): """ def __init__( - self, - config, - tokenizer, - role_worker_mapping: dict[Role, WorkerType], - resource_pool_manager: ResourcePoolManager, - ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, - processor=None, - reward_fn=None, - val_reward_fn=None, - device_name=None, + self, + config, + tokenizer, + role_worker_mapping: dict[Role, WorkerType], + resource_pool_manager: ResourcePoolManager, + ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, + processor=None, + reward_fn=None, + 
val_reward_fn=None, + device_name=None, ): # Store the tokenizer for text processing self.tokenizer = tokenizer @@ -128,6 +128,7 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]: required_samples = n_responses_per_prompt * batch_size print( + "[FullyAsyncTrainer] " f"Requesting {required_samples} samples from queue (n={n_responses_per_prompt}, batch_size={batch_size})", flush=True, ) @@ -141,17 +142,13 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]: logger.warning("required_samples is empty") return None, None - print(f"Retrieved {len(queue_samples)} samples from queue. wait time {consumer_end - consumer_start}") + print(f"[FullyAsyncTrainer] Retrieved {len(queue_samples)} samples from queue." + f"wait time {consumer_end - consumer_start:.2f} seconds.") queue_samples = [ray.cloudpickle.loads(x) for x in queue_samples] - print(queue_samples) - # Assemble batch batch = self._assemble_gen_batch_output_from_queue_samples(queue_samples) - print("=" * 200) - print(batch) - return 0, batch def _assemble_gen_batch_output_from_queue_samples(self, queue_samples: list[QueueSample]): @@ -173,7 +170,7 @@ def _assemble_gen_batch_output_from_queue_samples(self, queue_samples: list[Queu if not queue_samples: raise ValueError("Empty queue_samples provided for batch assembly") - print(f"Assembling batch from {len(queue_samples)} queue samples") + print(f"[FullyAsyncTrainer] Assembling batch from {len(queue_samples)} queue samples") # Extract data and metadata from all samples sample_data_list = [] @@ -215,7 +212,7 @@ def _assemble_gen_batch_output_from_queue_samples(self, queue_samples: list[Queu "avg_sample_age": np.mean([time.time() - ts for ts in sample_timestamps]), } - print(meta_info) + print(f"[FullyAsyncTrainer] {meta_info}") return batch @@ -254,7 +251,7 @@ def fit(self): to construct the PPO dataflow. The light-weight advantage computation is done on the driver process. 
""" - print("Starting FullyAsyncTrainer...") + print("[FullyAsyncTrainer] Starting FullyAsyncTrainer...") if self.message_queue_client is None: raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.") if self.param_synchronizer is None: @@ -281,16 +278,6 @@ def fit(self): # Use queue mode, no need for traditional dataloader iterator # Initialize to get the first batch of data while True: - print("while True", flush=True) - - # Check queue status - if self.message_queue_client: - queue_stats = self.message_queue_client.get_statistics() - print(f"Queue status before getting samples: {queue_stats}") - - if queue_stats.get("queue_size", 0) == 0: - print("WARNING: Queue is empty, will block waiting for samples") - metrics = {} timing_raw = {} @@ -302,8 +289,6 @@ def fit(self): if batch is None: break - print("_get_samples_from_queue end") - # # 更新统计信息 # self.processed_samples += len(batch) if isinstance(batch, list) else 1 # @@ -332,20 +317,15 @@ def fit(self): # "statistics/current_param_version": self.current_param_version, # } # ) - print("_process_batch_common") batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw) - print("_log_rollout") self._log_rollout(batch, reward_extra_infos_dict, timing_raw) - print("_check_save_checkpoint") self._check_save_checkpoint(is_last_step, timing_raw) - print("_collect_metrics") # self._collect_metrics(batch, epoch, metrics, timing_raw) # Trigger parameter synchronization after training step - print("_trigger_parameter_sync_after_step") self._trigger_parameter_sync_after_step() - print(f"global_steps: {self.global_steps}") + print(f"[FullyAsyncTrainer] global_steps: {self.global_steps}") self.global_steps += 1 def get_statistics(self) -> dict: @@ -369,11 +349,12 @@ def _trigger_parameter_sync_after_step(self): """ self.current_param_version = self.current_param_version + 1 print( - f"[TRAINER] Triggering parameter sync after " + f"[FullyAsyncTrainer] Triggering 
parameter sync after " f"training step {self.global_steps}, version: {self.current_param_version}" ) - logger.info( - f"Triggering parameter sync after training step {self.global_steps}, version: {self.current_param_version}" + print( + f"[FullyAsyncTrainer] Triggering parameter sync" + f" after training step {self.global_steps}, version: {self.current_param_version}" ) ray.get(self.param_synchronizer.sync_weights.remote(self.current_param_version)) diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py index 3efe982752d..089a703f924 100644 --- a/recipe/fully_async_policy/message_queue.py +++ b/recipe/fully_async_policy/message_queue.py @@ -81,7 +81,7 @@ def put_sample(self, sample: Any, param_version: int) -> bool: with self.lock: # Check freshness staleness = self.current_param_version - param_version - if staleness >= self.staleness_threshold: + if staleness > self.staleness_threshold: self.dropped_samples += 1 logger.debug(f"Dropped stale sample: staleness={staleness}, threshold={self.staleness_threshold}") return False @@ -113,13 +113,11 @@ def get_samples(self, min_batch_count: int = 1) -> list[Any]: List[Any]: List of retrieved samples """ - print("get_samples") with self.lock: while len(self.queue) < min_batch_count and self.running: - print(f"consumer_condition {len(self.queue)}") - for data in self.queue: - if data is None: - return [] + print(f"[MessageQueue] consumer_condition {len(self.queue)}") + if len(self.queue) > 0 and self.queue[-1] is None: + return [] self.consumer_condition.wait() # If queue is closed and doesn't have enough samples, return empty list diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py index 11d94c79ae4..3de781959ab 100644 --- a/recipe/fully_async_policy/param_sync.py +++ b/recipe/fully_async_policy/param_sync.py @@ -60,7 +60,7 @@ def _init_weights_info(self): self.rollout_wg.set_actor_weights_info(self.weights_info) def 
_init_sync_group(self): - print("Initializing parameter synchronization group...") + print("[ParameterSynchronizer] Initializing parameter synchronization group...") actor_rollout_workers = self.actor_wg.workers + self.rollout_wg.workers collective.create_collective_group( actor_rollout_workers, @@ -72,7 +72,7 @@ def _init_sync_group(self): def sync_weights(self, version): self.current_version = version - print(f"Starting weight synchronization (version {self.current_version})...") + print(f"[ParameterSynchronizer] Starting weight synchronization (version {self.current_version})...") ray.get(self.rollouter.pause.remote()) @@ -86,4 +86,4 @@ def sync_weights(self, version): # Update rollout version ray.get(self.rollouter.update_param_version.remote(version)) ray.get(self.rollouter.resume.remote()) - print("sync_weights success") + print("[ParameterSynchronizer] sync_weights success") diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py index 89acaebfe03..e8398fd0865 100644 --- a/verl/trainer/ppo/ray_trainer.py +++ b/verl/trainer/ppo/ray_trainer.py @@ -1237,7 +1237,6 @@ def _post_generate_batch(self, batch, gen_batch_output, metrics): def _process_batch_common(self, batch, metrics, timing_raw): with marked_timer("reward", timing_raw, color="yellow"): # compute reward model score - print("marked_timer reward") if self.use_rm: reward_tensor = self.rm_wg.compute_rm_score(batch) batch = batch.union(reward_tensor) @@ -1248,7 +1247,6 @@ def _process_batch_common(self, batch, metrics, timing_raw): reward_tensor, reward_extra_infos_dict = compute_reward(batch, self.reward_fn) # recompute old_log_probs with marked_timer("old_log_prob", timing_raw, color="blue"): - print("marked_timer old_log_prob") old_log_prob = self.actor_rollout_wg.compute_log_prob(batch) entropys = old_log_prob.batch["entropys"] @@ -1284,8 +1282,6 @@ def _process_batch_common(self, batch, metrics, timing_raw): } ) if self.use_reference_policy: - print("marked_timer 
use_reference_policy") - # compute reference log_prob with marked_timer("ref", timing_raw, color="olive"): if not self.ref_in_actor: @@ -1295,12 +1291,10 @@ def _process_batch_common(self, batch, metrics, timing_raw): batch = batch.union(ref_log_prob) # compute values if self.use_critic: - print("marked_timer compute use_critic") with marked_timer("values", timing_raw, color="cyan"): values = self.critic_wg.compute_values(batch) batch = batch.union(values) with marked_timer("adv", timing_raw, color="brown"): - print("marked_timer adv") # we combine with rule-based rm reward_extra_infos_dict: dict[str, list] if self.config.reward_model.launch_reward_fn_async: @@ -1336,7 +1330,6 @@ def _process_batch_common(self, batch, metrics, timing_raw): ) # update critic if self.use_critic: - print("marked_timer update use_critic") with marked_timer("update_critic", timing_raw, color="pink"): critic_output = self.critic_wg.update_critic(batch) critic_output_metrics = reduce_metrics(critic_output.meta_info["metrics"]) @@ -1344,7 +1337,6 @@ def _process_batch_common(self, batch, metrics, timing_raw): # implement critic warmup if self.config.trainer.critic_warmup <= self.global_steps: # update actor - print("marked_timer update_actor") with marked_timer("update_actor", timing_raw, color="red"): batch.meta_info["multi_turn"] = self.config.actor_rollout_ref.rollout.multi_turn.enable actor_output = self.actor_rollout_wg.update_actor(batch) From 444c3d1af644ce710b420bf4f981a40294ce8496 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Tue, 12 Aug 2025 15:58:56 +0800 Subject: [PATCH 038/182] update message --- .../fully_async_rollouter.py | 36 +++++----- .../fully_async_policy/fully_async_trainer.py | 68 +++++++++---------- recipe/fully_async_policy/message_queue.py | 6 +- tests/special_e2e/run_fully_async_policy.sh | 11 +-- 4 files changed, 55 insertions(+), 66 deletions(-) diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py 
index 2d9d839feca..97d0f627eb8 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -72,7 +72,7 @@ def __init__( self.use_reference_policy = False self.use_rm = False - print(f"[ROLLOUTER] Creating datasets...") + print(f"[FullyAsyncRollouter] Creating datasets...") from verl.trainer.main_ppo import create_rl_dataset, create_rl_sampler from verl.utils.dataset.rl_dataset import collate_fn @@ -81,7 +81,7 @@ def __init__( train_sampler = create_rl_sampler(config.data, train_dataset) self._validate_config() - print(f"[ROLLOUTER] Rollouter _create_dataloader...\n{train_dataset}\n{val_dataset}") + print(f"[FullyAsyncRollouter] Rollouter _create_dataloader...\n{train_dataset}\n{val_dataset}") self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler) total_rollout_steps = len(self.train_dataloader) * self.config.trainer.total_epochs @@ -90,7 +90,7 @@ def __init__( total_rollout_steps = self.config.rollout.total_rollout_steps self.total_rollout_steps = total_rollout_steps - print(f"[ROLLOUTER] Total rollout steps: {self.total_rollout_steps}") + print(f"[FullyAsyncRollouter] Total rollout steps: {self.total_rollout_steps}") # Rollouter parameter configuration self.message_queue_client = None @@ -156,12 +156,12 @@ def update_param_version(self, version: int): self.current_param_version = version # every time param change, reset train_step_samples self.train_step_samples = 0 - print(f"[ROLLOUTER] Parameter version updated from {old_version} to {version}") + print(f"[FullyAsyncRollouter] Parameter version updated from {old_version} to {version}") def _validate_config(self): # Validate asynchronous training configuration if not hasattr(self.config, "async_training"): - raise ValueError("[ROLLOUTER] Missing async_training configuration") + raise ValueError("[FullyAsyncRollouter] Missing async_training configuration") def _create_actor_rollout_classes(self): # only create rollout @@ -189,7 
+189,7 @@ def _create_continuous_iterator(self): yield epoch, batch_dict def fit(self): - print(f"[ROLLOUTER] Starting FullyAsyncRollouter...") + print(f"[FullyAsyncRollouter] Starting FullyAsyncRollouter...") if self.message_queue_client is None: raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.") @@ -210,7 +210,7 @@ def fit(self): self.generation_thread.join() self.monitor_thread.join() - print(f"[ROLLOUTER] Rollouter fit completed") + print(f"[FullyAsyncRollouter] Rollouter fit completed") def _generation_loop(self): """ @@ -247,7 +247,7 @@ def _generation_loop(self): if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True): val_metrics = self._validate() assert val_metrics, f"{val_metrics=}" - pprint(f"[ROLLOUTER] Initial validation metrics: {val_metrics}") + pprint(f"[FullyAsyncRollouter] Initial validation metrics: {val_metrics}") self.logger.log(data=val_metrics, step=self.global_steps) if self.config.trainer.get("val_only", False): return @@ -267,7 +267,7 @@ def _generation_loop(self): self.pause() while self.paused and self.running: - print(f"[ROLLOUTER] Generation thread paused, waiting...") + print(f"[FullyAsyncRollouter] Generation thread paused, waiting...") self.condition.wait() if not self.running: @@ -319,7 +319,7 @@ def _generation_loop(self): self.global_steps += 1 if is_last_step: - pprint(f"[ROLLOUTER] Final validation metrics: {last_val_metrics}") + pprint(f"[FullyAsyncRollouter] Final validation metrics: {last_val_metrics}") break with self.lock: @@ -343,14 +343,14 @@ def _monitor_loop(self): # 定期打印统计信息 current_time = time.time() if current_time - last_stats_time >= stats_interval: - print(f"[ROLLOUTER] {self.get_statistics()}") + print(f"[FullyAsyncRollouter] {self.get_statistics()}") last_stats_time = current_time if not self._should_pause_generation(): with self.lock: if self.paused: self.paused = False self.condition.notify_all() - print(f"[ROLLOUTER] Generation resumed") + 
print(f"[FullyAsyncRollouter] Generation resumed") def _should_pause_generation(self) -> bool: """Determine whether the build should be paused""" @@ -363,7 +363,7 @@ def _should_pause_generation(self) -> bool: if version_diff > self.staleness_threshold: print( - "[ROLLOUTER] " + "[FullyAsyncRollouter] " f"Should pause due to version_diff > self.staleness_threshold: " f"rollout_version={self.current_param_version}, " f"trainer_version={current_trainer_version}, diff={version_diff}" @@ -371,25 +371,25 @@ def _should_pause_generation(self) -> bool: return True if queue_size >= self.max_queue_size: - print(f"[ROLLOUTER] Should pause due to full queue: size={queue_size}, max={self.max_queue_size}") + print(f"[FullyAsyncRollouter] Should pause due to full queue: size={queue_size}, max={self.max_queue_size}") return True if self.train_step_samples >= self.max_required_samples: - print(f"[ROLLOUTER] Should pause due to step_generated_samples >= max_required_samples: " + print(f"[FullyAsyncRollouter] Should pause due to step_generated_samples >= max_required_samples: " f"self.step_generated_samples={self.train_step_samples}, max={self.max_required_samples}") return True return False except Exception as e: - print(f"[ROLLOUTER] Error checking pause conditions: {e}") + print(f"[FullyAsyncRollouter] Error checking pause conditions: {e}") return True def pause(self) -> bool: """pause rollout TODO integrated Partial Rollout """ - print(f"[ROLLOUTER] pause") + print(f"[FullyAsyncRollouter] pause") with self.lock: if not self.running: return False @@ -404,7 +404,7 @@ def resume(self) -> bool: """resume rollout TODO integrated Partial Rollout """ - print(f"[ROLLOUTER] resume") + print(f"[FullyAsyncRollouter] resume") with self.lock: if not self.running: return False diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 418ab024d0a..70f30a180f8 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ 
b/recipe/fully_async_policy/fully_async_trainer.py @@ -135,15 +135,17 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]: # Get samples from queue consumer_start = time.time() - queue_samples = self.message_queue_client.get_samples(min_batch_count=required_samples) + queue_samples, queue_len = self.message_queue_client.get_samples(min_batch_count=required_samples) consumer_end = time.time() if not queue_samples or len(queue_samples) == 0: logger.warning("required_samples is empty") return None, None - print(f"[FullyAsyncTrainer] Retrieved {len(queue_samples)} samples from queue." - f"wait time {consumer_end - consumer_start:.2f} seconds.") + print(f"[FullyAsyncTrainer] Retrieved {len(queue_samples)} samples from queue. " + f"wait time {consumer_end - consumer_start:.2f} seconds. " + f"queue len {queue_len}. " + ) queue_samples = [ray.cloudpickle.loads(x) for x in queue_samples] # Assemble batch @@ -289,34 +291,34 @@ def fit(self): if batch is None: break - # # 更新统计信息 - # self.processed_samples += len(batch) if isinstance(batch, list) else 1 - # - # # 从meta_info中获取参数版本信息 - # if hasattr(batch, "meta_info") and batch.meta_info: - # rollout_param_versions = batch.meta_info.get("rollout_param_versions", []) - # if rollout_param_versions: - # # 统计陈旧样本 - # stale_count = sum(1 for v in rollout_param_versions if self.current_param_version - v > 1) - # self.stale_samples_processed += stale_count - # - # # 添加新鲜度指标到metrics - # if rollout_param_versions: - # param_version_diversity = batch.meta_info.get("param_version_diversity", 0) - # avg_sample_age = batch.meta_info.get("avg_sample_age", 0) - # - # metrics.update( - # { - # "freshness/param_version_diversity": param_version_diversity, - # "freshness/avg_sample_age": avg_sample_age, - # "freshness/stale_samples_ratio": stale_count / len(rollout_param_versions) - # if rollout_param_versions - # else 0, - # "statistics/processed_samples": self.processed_samples, - # "statistics/stale_samples_processed": 
self.stale_samples_processed, - # "statistics/current_param_version": self.current_param_version, - # } - # ) + # 更新统计信息 + self.processed_samples += len(batch) if isinstance(batch, list) else 1 + + # 从meta_info中获取参数版本信息 + if hasattr(batch, "meta_info") and batch.meta_info: + rollout_param_versions = batch.meta_info.get("rollout_param_versions", []) + if rollout_param_versions: + # 统计陈旧样本 + stale_count = sum(1 for v in rollout_param_versions if self.current_param_version - v > 1) + self.stale_samples_processed += stale_count + + # 添加新鲜度指标到metrics + if rollout_param_versions: + param_version_diversity = batch.meta_info.get("param_version_diversity", 0) + avg_sample_age = batch.meta_info.get("avg_sample_age", 0) + + metrics.update( + { + "freshness/param_version_diversity": param_version_diversity, + "freshness/avg_sample_age": avg_sample_age, + "freshness/stale_samples_ratio": stale_count / len(rollout_param_versions) + if rollout_param_versions + else 0, + "statistics/processed_samples": self.processed_samples, + "statistics/stale_samples_processed": self.stale_samples_processed, + "statistics/current_param_version": self.current_param_version, + } + ) batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw) self._log_rollout(batch, reward_extra_infos_dict, timing_raw) self._check_save_checkpoint(is_last_step, timing_raw) @@ -352,10 +354,6 @@ def _trigger_parameter_sync_after_step(self): f"[FullyAsyncTrainer] Triggering parameter sync after " f"training step {self.global_steps}, version: {self.current_param_version}" ) - print( - f"[FullyAsyncTrainer] Triggering parameter sync" - f" after training step {self.global_steps}, version: {self.current_param_version}" - ) ray.get(self.param_synchronizer.sync_weights.remote(self.current_param_version)) def _compute_sample_freshness_metrics(self, batch_samples: list[QueueSample]) -> dict: diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py index 
089a703f924..bae34c84e47 100644 --- a/recipe/fully_async_policy/message_queue.py +++ b/recipe/fully_async_policy/message_queue.py @@ -102,7 +102,7 @@ def put_sample(self, sample: Any, param_version: int) -> bool: return True - def get_samples(self, min_batch_count: int = 1) -> list[Any]: + def get_samples(self, min_batch_count: int = 1) -> tuple[list[Any], int]: """ Get batch samples from the queue, wait until enough samples are available @@ -136,7 +136,7 @@ def get_samples(self, min_batch_count: int = 1) -> list[Any]: samples.append(data) self.total_consumed += len(samples) - return samples + return samples, len(self.queue) def update_param_version(self, version: int): """Update current parameter version""" @@ -217,7 +217,7 @@ def put_sample(self, sample: Any, param_version: int) -> bool: """Put batch into queue""" return ray.get(self.queue_actor.put_sample.remote(sample, param_version)) - def get_samples(self, min_batch_count: int = 1) -> list[Any]: + def get_samples(self, min_batch_count: int = 1) -> tuple[list[Any], int]: """Get batch from queue, wait until enough samples are available""" return ray.get(self.queue_actor.get_samples.remote(min_batch_count)) diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh index c95476e898a..27c033abc1d 100644 --- a/tests/special_e2e/run_fully_async_policy.sh +++ b/tests/special_e2e/run_fully_async_policy.sh @@ -51,13 +51,6 @@ n_gpus_training=$((NUM_GPUS - n_gpus_rollout)) # Async training specific configurations staleness_threshold=3 -min_batch_count=1 -batch_timeout=30.0 -generation_timeout=30.0 -batch_generation_interval=0.1 -max_sync_retries=3 -sync_timeout=30.0 -sync_retry_delay=1.0 exp_name="$(basename "${MODEL_ID,,}")-fully-async-policy-${ACTOR_STRATEGY}-minimal" @@ -120,11 +113,9 @@ common_params=( rollout.nnodes=1 rollout.n_gpus_per_node=${n_gpus_rollout} rollout.total_rollout_steps=10 - rollout.total_epochs=10 + rollout.total_epochs=2 # Fully async specific 
configurations async_training.staleness_threshold=${staleness_threshold} - async_training.sync_timeout=${sync_timeout} - async_training.sync_retry_delay=${sync_retry_delay} ) if [ "${ACTOR_STRATEGY}" == "fsdp2" ]; then From bd7520703c411b36ca9697e5aa677ad028d79b38 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Tue, 12 Aug 2025 18:45:05 +0800 Subject: [PATCH 039/182] sync weight time --- .../fully_async_rollouter.py | 27 ++++++++++--------- recipe/fully_async_policy/message_queue.py | 6 ++--- recipe/fully_async_policy/param_sync.py | 7 ++++- 3 files changed, 23 insertions(+), 17 deletions(-) diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 97d0f627eb8..81fee5c6074 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -282,13 +282,14 @@ def _generation_loop(self): is_last_step = self.global_steps >= self.total_rollout_steps # generate a batch - with marked_timer("gen", timing_raw, color="red"): - if not self.async_rollout_mode: - gen_batch_output = self.actor_rollout_wg.generate_sequences(gen_batch) - else: - gen_batch_output = self.async_rollout_manager.generate_sequences(gen_batch) - timing_raw.update(gen_batch_output.meta_info["timing"]) - gen_batch_output.meta_info.pop("timing", None) + with self.lock: + with marked_timer("gen", timing_raw, color="red"): + if not self.async_rollout_mode: + gen_batch_output = self.actor_rollout_wg.generate_sequences(gen_batch) + else: + gen_batch_output = self.async_rollout_manager.generate_sequences(gen_batch) + timing_raw.update(gen_batch_output.meta_info["timing"]) + gen_batch_output.meta_info.pop("timing", None) if gen_batch_output is not None: # prepare rollout metadata @@ -332,6 +333,10 @@ def _generation_loop(self): ) def _monitor_loop(self): + """ + Function 1: Log information output + Function 2: Trigger rollout recovery + """ last_stats_time = time.time() stats_interval = 30.0 
check_interval = 5.0 @@ -346,11 +351,7 @@ def _monitor_loop(self): print(f"[FullyAsyncRollouter] {self.get_statistics()}") last_stats_time = current_time if not self._should_pause_generation(): - with self.lock: - if self.paused: - self.paused = False - self.condition.notify_all() - print(f"[FullyAsyncRollouter] Generation resumed") + self.resume() def _should_pause_generation(self) -> bool: """Determine whether the build should be paused""" @@ -413,7 +414,7 @@ def resume(self) -> bool: return True self.paused = False - self.condition.notify_all() + self.actor_rollout_wg.resume() return True def get_statistics(self) -> dict: diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py index bae34c84e47..e86c006106e 100644 --- a/recipe/fully_async_policy/message_queue.py +++ b/recipe/fully_async_policy/message_queue.py @@ -117,12 +117,12 @@ def get_samples(self, min_batch_count: int = 1) -> tuple[list[Any], int]: while len(self.queue) < min_batch_count and self.running: print(f"[MessageQueue] consumer_condition {len(self.queue)}") if len(self.queue) > 0 and self.queue[-1] is None: - return [] + return [], len(self.queue) self.consumer_condition.wait() # If queue is closed and doesn't have enough samples, return empty list if not self.running and len(self.queue) < min_batch_count: - return [] + return [], len(self.queue) # Get specified number of samples batch_count = min(min_batch_count, len(self.queue)) @@ -131,7 +131,7 @@ def get_samples(self, min_batch_count: int = 1) -> tuple[list[Any], int]: if self.queue: data = self.queue.popleft() if data is None: - return [] + return [], len(self.queue) else: samples.append(data) diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py index 3de781959ab..ccf62462264 100644 --- a/recipe/fully_async_policy/param_sync.py +++ b/recipe/fully_async_policy/param_sync.py @@ -13,6 +13,7 @@ # limitations under the License. 
import logging +import time import ray from ray.util.collective import collective @@ -71,6 +72,8 @@ def _init_sync_group(self): ) def sync_weights(self, version): + start_time = time.time() + self.current_version = version print(f"[ParameterSynchronizer] Starting weight synchronization (version {self.current_version})...") @@ -86,4 +89,6 @@ def sync_weights(self, version): # Update rollout version ray.get(self.rollouter.update_param_version.remote(version)) ray.get(self.rollouter.resume.remote()) - print("[ParameterSynchronizer] sync_weights success") + end_time = time.time() + + print(f"[ParameterSynchronizer] sync_weights success. cost {end_time - start_time} seconds") From 57b93b7be195d4c7c9d8e100706f64db2075cb58 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Wed, 13 Aug 2025 10:11:21 +0800 Subject: [PATCH 040/182] total batch to mini batch --- .../dapo_7b_math_fsdp2_4_12.sh | 148 ++++++++++++++++++ .../fully_async_rollouter.py | 2 +- recipe/fully_async_policy/param_sync.py | 2 +- 3 files changed, 150 insertions(+), 2 deletions(-) create mode 100644 recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh b/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh new file mode 100644 index 00000000000..d2f9fa2d6f0 --- /dev/null +++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh @@ -0,0 +1,148 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +project_name='DAPO' +exp_name='DAPO-Qwen2.5-7b-MATH-0527a1-fsdp2-one-step-off-4-12' + +# Ray +# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} +# WORKING_DIR=${WORKING_DIR:-"${PWD}"} +# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} +# Paths +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +# very important! 
please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface +MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"} +CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} +TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} +TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} + +MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B +CKPTS_DIR=./ckpts/${project_name}/${exp_name} +TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet +TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet + + +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 * 8)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 + +loss_agg_mode="token-mean" + +train_prompt_bsz=4 +gen_prompt_bsz=1 +n_resp_per_prompt=16 +train_prompt_mini_bsz=32 +train_sync_weight_steps=64 + +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + +# Performance Related Parameter +use_dynamic_bsz=True +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) +ref_offload=True +actor_offload=False +gen_tp=1 +sp_size=1 +fsdp_size=2 + +staleness_threshold=3 + +NNODES=${NNODES:-1} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} + +n_gpus_rollout=4 +n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout)) + +/home/hadoop-djst-algoplat/miniconda3/bin/python -m recipe.fully_async_policy.fully_async_main \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + 
data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + data.gen_batch_size=${gen_prompt_bsz} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.strategy=fsdp2 \ + critic.strategy=fsdp2 \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.hybrid_engine=False \ + +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ + actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.grad_clip=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + 
actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \ + actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ + reward_model.reward_manager=dapo \ + +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger=['console','tensorboard'] \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.val_before_train=True \ + trainer.test_freq=10 \ + trainer.save_freq=-1 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.resume_mode=auto \ + trainer.nnodes="${NNODES}" \ + trainer.n_gpus_per_node="${n_gpus_training}" \ + rollout.nnodes="${NNODES}" \ + rollout.n_gpus_per_node="${n_gpus_rollout}" \ + rollout.total_rollout_steps=100 \ + rollout.total_epochs=2 \ + async_training.staleness_threshold=${staleness_threshold} diff --git 
a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 81fee5c6074..e3bb3e0652b 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -414,7 +414,7 @@ def resume(self) -> bool: return True self.paused = False - self.actor_rollout_wg.resume() + self.condition.notify_all() return True def get_statistics(self) -> dict: diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py index ccf62462264..7e40e755a12 100644 --- a/recipe/fully_async_policy/param_sync.py +++ b/recipe/fully_async_policy/param_sync.py @@ -91,4 +91,4 @@ def sync_weights(self, version): ray.get(self.rollouter.resume.remote()) end_time = time.time() - print(f"[ParameterSynchronizer] sync_weights success. cost {end_time - start_time} seconds") + print(f"[ParameterSynchronizer] sync_weights success. cost {end_time - start_time:.2f} seconds") From aeb4056f611398a72be46d782b4016c61e00d4a6 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Wed, 13 Aug 2025 15:35:21 +0800 Subject: [PATCH 041/182] StreamRL batch --- recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh | 4 ++-- recipe/fully_async_policy/fully_async_rollouter.py | 6 ++++++ recipe/fully_async_policy/fully_async_trainer.py | 6 ++++-- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh b/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh index d2f9fa2d6f0..5c2ac5e6017 100644 --- a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh +++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh @@ -40,8 +40,8 @@ overlong_penalty_factor=1.0 loss_agg_mode="token-mean" -train_prompt_bsz=4 -gen_prompt_bsz=1 +train_prompt_bsz=2 +gen_prompt_bsz=4 n_resp_per_prompt=16 train_prompt_mini_bsz=32 train_sync_weight_steps=64 diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py 
index e3bb3e0652b..0c2574e5f06 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -283,6 +283,7 @@ def _generation_loop(self): # generate a batch with self.lock: + start_time = time.time() with marked_timer("gen", timing_raw, color="red"): if not self.async_rollout_mode: gen_batch_output = self.actor_rollout_wg.generate_sequences(gen_batch) @@ -290,6 +291,8 @@ def _generation_loop(self): gen_batch_output = self.async_rollout_manager.generate_sequences(gen_batch) timing_raw.update(gen_batch_output.meta_info["timing"]) gen_batch_output.meta_info.pop("timing", None) + end_time = time.time() + print(f"[FullyAsyncRollouter] rollout time {end_time - start_time:.2f} seconds") if gen_batch_output is not None: # prepare rollout metadata @@ -300,6 +303,7 @@ def _generation_loop(self): } batch = self._post_generate_batch(batch, gen_batch_output, metrics) + start_time = time.time() for sample in batch: # for sample in samples: queue_sample = QueueSample( @@ -316,6 +320,8 @@ def _generation_loop(self): self.train_step_samples += 1 else: self.dropped_stale_samples += 1 + end_time = time.time() + print(f"[FullyAsyncRollouter] mq push time {end_time - start_time:.2f} seconds") self.global_steps += 1 diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 70f30a180f8..4ee8fa52332 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -121,7 +121,6 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]: Returns: tuple: (epoch, batch_dict, gen_batch_output) """ - # Calculate the number of samples needed n_responses_per_prompt = self.config.actor_rollout_ref.rollout.n batch_size = self.config.data.train_batch_size @@ -165,6 +164,8 @@ def _assemble_gen_batch_output_from_queue_samples(self, queue_samples: list[Queu Returns: DataProto: Assembled gen_batch_output """ + 
start_time = time.time() + import numpy as np from verl.protocol import DataProto @@ -214,7 +215,8 @@ def _assemble_gen_batch_output_from_queue_samples(self, queue_samples: list[Queu "avg_sample_age": np.mean([time.time() - ts for ts in sample_timestamps]), } - print(f"[FullyAsyncTrainer] {meta_info}") + end_time = time.time() + print(f"[FullyAsyncTrainer] {meta_info} time elapsed: {end_time - start_time:.2f} seconds") return batch From 6c9d615e3a8fc47828213bcee9747b00069f255f Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Thu, 14 Aug 2025 03:58:55 +0800 Subject: [PATCH 042/182] stream rollout --- recipe/fully_async_policy/fully_async_main.py | 9 +- .../fully_async_rollouter.py | 449 ++++++++++++------ .../fully_async_policy/fully_async_trainer.py | 143 ++++-- recipe/fully_async_policy/message_queue.py | 24 + .../simple_streaming_demo.py | 176 +++++++ recipe/one_step_off_policy/ray_trainer.py | 28 +- verl/experimental/agent_loop/agent_loop.py | 245 +++++++--- verl/trainer/main_ppo.py | 2 +- verl/trainer/ppo/ray_trainer.py | 1 - 9 files changed, 797 insertions(+), 280 deletions(-) create mode 100644 recipe/fully_async_policy/simple_streaming_demo.py diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py index 163b2420381..179929f242a 100644 --- a/recipe/fully_async_policy/fully_async_main.py +++ b/recipe/fully_async_policy/fully_async_main.py @@ -187,10 +187,11 @@ def _initialize_components(self, config) -> None: self.components["reward_fn"] = reward_fn self.components["val_reward_fn"] = val_reward_fn - self.max_queue_size = ((config.async_training.staleness_threshold + 1) - * config.data.train_batch_size - * config.actor_rollout_ref.rollout.n - ) * 10 # x 10 avoid deadlock + self.max_queue_size = ( + (config.async_training.staleness_threshold + 1) + * config.data.train_batch_size + * config.actor_rollout_ref.rollout.n + ) * 10 # x 10 avoid deadlock print("[ASYNC MAIN] Creating MessageQueue...") message_queue = 
MessageQueue.remote(config, self.max_queue_size) message_queue_client = MessageQueueClient(message_queue) diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 0c2574e5f06..ae74cc838b1 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -11,9 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import threading +import asyncio import time -from concurrent.futures import ThreadPoolExecutor from pprint import pprint import ray @@ -22,7 +21,6 @@ from recipe.fully_async_policy.message_queue import MessageQueueClient, QueueSample from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup from verl.trainer.ppo.ray_trainer import RayPPOTrainer, ResourcePoolManager, Role, WorkerType -from verl.utils.debug import marked_timer from verl.utils.tracking import ValidationGenerationsLogger @@ -35,17 +33,17 @@ class FullyAsyncRollouter(RayPPOTrainer): """ def __init__( - self, - config, - tokenizer, - role_worker_mapping: dict[Role, WorkerType], - resource_pool_manager: ResourcePoolManager, - ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, - processor=None, - reward_fn=None, - val_reward_fn=None, - device_name=None, - max_queue_size=1000, + self, + config, + tokenizer, + role_worker_mapping: dict[Role, WorkerType], + resource_pool_manager: ResourcePoolManager, + ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, + processor=None, + reward_fn=None, + val_reward_fn=None, + device_name=None, + max_queue_size=1000, ): # Store the tokenizer for text processing self.tokenizer = tokenizer @@ -72,7 +70,7 @@ def __init__( self.use_reference_policy = False self.use_rm = False - print(f"[FullyAsyncRollouter] Creating datasets...") + print("[FullyAsyncRollouter] Creating datasets...") from 
verl.trainer.main_ppo import create_rl_dataset, create_rl_sampler from verl.utils.dataset.rl_dataset import collate_fn @@ -82,6 +80,9 @@ def __init__( self._validate_config() print(f"[FullyAsyncRollouter] Rollouter _create_dataloader...\n{train_dataset}\n{val_dataset}") + + assert self.config.data.gen_batch_size == 1, "gen_batch_size must be one" + self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler) total_rollout_steps = len(self.train_dataloader) * self.config.trainer.total_epochs @@ -119,11 +120,9 @@ def __init__( # Concurrency control self.running = False self.paused = False - self.generation_thread = None - self.monitor_thread = None - self.thread_executor = ThreadPoolExecutor(max_workers=2) - self.lock = threading.RLock() - self.condition = threading.Condition(self.lock) + # Initialize async locks directly - asyncio.Lock() creation is synchronous + self.lock = asyncio.Lock() + self.condition = asyncio.Condition(self.lock) # Pause/resume statistics self.total_pause_time = 0.0 @@ -135,23 +134,34 @@ def __init__( # queue size self.max_queue_size = max_queue_size - def set_message_queue_client(self, message_queue_client: MessageQueueClient): + self.async_rollout_manager = None + + # 流式处理相关配置 + self.max_concurrent_samples = async_config.get("max_concurrent_samples", 512) # 最大并发处理样本数 + + # 流式处理统计 + self.max_processing_time = 0.0 # 最长处理时间 + self.processed_sample_count = 0 # 已处理的样本计数 + self.active_sample_count = 0 # 当前正在处理的样本数 + self.queue_full_pause_count = 0 # 队列满导致的暂停次数 + + async def set_message_queue_client(self, message_queue_client: MessageQueueClient): """Set message queue client""" - with self.lock: + async with self.lock: self.message_queue_client = message_queue_client - def set_parameter_synchronizer(self, param_synchronizer): + async def set_parameter_synchronizer(self, param_synchronizer): """Set parameter synchronizer""" - with self.lock: + async with self.lock: self.param_synchronizer = param_synchronizer def 
get_rollout_wg(self): """Get rollout worker group""" return self.rollout_wg - def update_param_version(self, version: int): + async def update_param_version(self, version: int): """Update current parameter version""" - with self.lock: + async with self.lock: old_version = self.current_param_version self.current_param_version = version # every time param change, reset train_step_samples @@ -188,46 +198,163 @@ def _create_continuous_iterator(self): for batch_dict in iterator: yield epoch, batch_dict - def fit(self): - print(f"[FullyAsyncRollouter] Starting FullyAsyncRollouter...") + def _init_async_rollout_manager(self): + # create async rollout manager and request scheduler + assert self.config.actor_rollout_ref.rollout.mode == "async" + from verl.experimental.agent_loop import AgentLoopManager - if self.message_queue_client is None: - raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.") - if self.param_synchronizer is None: - raise ValueError("param_synchronizer client not set. 
Call set_parameter_synchronizer() first.") + self.async_rollout_mode = True + self.async_rollout_manager = AgentLoopManager( + config=self.config, + worker_group=self.rollout_wg, + ) - # 设置运行状态 - with self.lock: - self.running = True - self.paused = False + # 添加样本到待处理队列的协程 + async def _feed_samples(self): + continuous_iterator = self._create_continuous_iterator() + sample_count = 0 + for epoch, batch_dict in continuous_iterator: + # 准备样本数据 + sample_id = f"sample_{epoch}_{sample_count}" + batch, gen_batch = self._prepare_generate_batch(batch_dict) - self.generation_thread = threading.Thread(target=self._generation_loop, daemon=True) - self.generation_thread.start() + sample_data = {"sample_id": sample_id, "gen_batch": gen_batch, "epoch": epoch, "timestamp": time.time()} - self.monitor_thread = threading.Thread(target=self._monitor_loop, daemon=True) - self.monitor_thread.start() + await self.pending_samples_queue.put(sample_data) + sample_count += 1 - self.generation_thread.join() - self.monitor_thread.join() + # 检查是否到达最后一步 + if self.global_steps >= self.total_rollout_steps: + print("[FullyAsyncRollouter] 达到最大步数,停止添加新样本") + break - print(f"[FullyAsyncRollouter] Rollouter fit completed") + self.global_steps += 1 - def _generation_loop(self): - """ + # 发送结束信号 + await self.pending_samples_queue.put("DONE") + + async def _submit_worker(self): + """流式处理工作协程 - 逐个样本立即提交处理,不等待批次""" + active_tasks = set() - Main Generation Loop + while True: + # 获取待处理样本 + sample_data = await self.pending_samples_queue.get() + + if sample_data == "DONE": + print("收到结束信号,等待剩余任务完成...") + # 等待所有活动任务完成 + if active_tasks: + await asyncio.gather(*active_tasks, return_exceptions=True) + break + + # 检查并发数是否超限 + while len(active_tasks) >= self.max_concurrent_samples: + print(f"达到最大并发数 {self.max_concurrent_samples},等待任务完成...") + # 等待至少一个任务完成 + done_tasks, active_tasks = await asyncio.wait(active_tasks, return_when=asyncio.FIRST_COMPLETED) + # 清理已完成的任务 + for task in done_tasks: + await task + + # 
立即提交单个样本处理 + task = asyncio.create_task( + self._process_single_sample_streaming(sample_data), name=f"process_{sample_data['sample_id']}" + ) + active_tasks.add(task) + + # 标记队列任务完成 + self.pending_samples_queue.task_done() + + async def _process_single_sample_streaming(self, sample_data: dict): + """流式处理单个样本""" + # 检查是否需要暂停处理 + if await self._should_pause_generation(): + print(f"[FullyAsyncRollouter] 暂停处理样本 {sample_data['sample_id']}") + # 暂停时重新放回队列 + await self.pending_samples_queue.put(sample_data) + return + + start_time = time.time() + # 直接使用AgentLoopManager的单样本异步处理能力 + agent_loop_output, processing_time = await self.async_rollout_manager.generate_single_sample_async( + sample_data["gen_batch"], sample_data["sample_id"] + ) + end_time = time.time() + + # 组装最终结果 + final_result = { + "sample_id": sample_data["sample_id"], + "agent_loop_output": agent_loop_output, + "processing_time": processing_time, + "timestamp": time.time(), + "param_version": self.current_param_version, + "epoch": sample_data["epoch"], + } + + # 立即放入结果队列 + await self.result_queue.put(final_result) + + async with self.lock: + self.processed_sample_count += 1 + # 更新最大处理时间统计 + if processing_time > self.max_processing_time: + self.max_processing_time = processing_time + + print( + f"[FullyAsyncRollouter] 样本 {sample_data['sample_id']} 处理完成," + f"耗时 {processing_time:.2f}s {end_time - start_time:.2f}s" + ) - Loop Entry Requirements: - 1. Running status validation - 2. Interruption detection - 3. Freshness validation - 4. 
train_step_samples validation + async def _consumer_worker(self): + """消费者协程,负责从结果队列获取处理结果并放入消息队列""" + while True: + async with self.lock: + if not self.running: + # 如果系统停止但还有结果待处理,继续处理 + if self.result_queue.empty(): + break + + # 从结果队列获取处理结果 + result = await self.result_queue.get() + + # 准备rollout metadata + rollout_metadata = { + "generation_timestamp": result["timestamp"], + "rollout_param_version": result["param_version"], + "processing_time": result["processing_time"], + "epoch": result["epoch"], + "agent_loop_metrics": result["agent_loop_output"].metrics.model_dump(), + } - During Sample Generation Process: - 1. Running status validation - 2. Interruption detection - """ + # 直接将 AgentLoopOutput 放入消息队列 + queue_sample = QueueSample( + data=result["agent_loop_output"], # 直接存储 AgentLoopOutput + rollout_metadata=rollout_metadata, + ) + success = self.message_queue_client.put_sample( + sample=ray.cloudpickle.dumps(queue_sample), + param_version=result["param_version"], + ) + + async with self.lock: + if success: + self.total_generated_samples += 1 + self.train_step_samples += 1 + else: + self.dropped_stale_samples += 1 + + print( + f"[FullyAsyncRollouter] 🔥 消费样本 {result['sample_id']}: " + f"{'成功' if success else '失败'}放入到消息队列, " + f"处理时间 {result['processing_time']:.2f}s" + ) + + # 标记结果队列任务完成 + self.result_queue.task_done() + async def _streaming_generation_main(self): + """流式处理的主入口方法,包含初始化和验证逻辑""" from verl.utils.tracking import Tracking self.logger = Tracking( @@ -254,82 +381,52 @@ def _generation_loop(self): # we start from step 1 self.global_steps += 1 - last_val_metrics = None - self.max_steps_duration = 0 - continuous_iterator = self._create_continuous_iterator() - for epoch, batch_dict in continuous_iterator: - with self.lock: - if not self.running: - break + # 确保async_rollout_manager已经初始化 + if self.async_rollout_manager is None: + self._init_async_rollout_manager() - if self._should_pause_generation(): - self.pause() + # 启动流式处理循环 + """流式样本生成主循环 - 
优化版本,确保先完成的样本优先进入队列""" + print(f"[FullyAsyncRollouter] 启动流式处理模式,最大并发样本数: {self.max_concurrent_samples}") - while self.paused and self.running: - print(f"[FullyAsyncRollouter] Generation thread paused, waiting...") - self.condition.wait() + # 初始化异步队列 + self.pending_samples_queue = asyncio.Queue(maxsize=self.max_concurrent_samples) + self.result_queue = asyncio.Queue() - if not self.running: - break + # 启动流式处理协程和消费者协程 + self.feed_task = asyncio.create_task(self._feed_samples()) + self.stream_processor_task = asyncio.create_task(self._submit_worker()) + self.consumer_task = asyncio.create_task(self._consumer_worker()) + # 启动样本添加协程 - metrics = {} - timing_raw = {} - - with self.lock: - batch, gen_batch = self._prepare_generate_batch(batch_dict) - - is_last_step = self.global_steps >= self.total_rollout_steps - - # generate a batch - with self.lock: - start_time = time.time() - with marked_timer("gen", timing_raw, color="red"): - if not self.async_rollout_mode: - gen_batch_output = self.actor_rollout_wg.generate_sequences(gen_batch) - else: - gen_batch_output = self.async_rollout_manager.generate_sequences(gen_batch) - timing_raw.update(gen_batch_output.meta_info["timing"]) - gen_batch_output.meta_info.pop("timing", None) - end_time = time.time() - print(f"[FullyAsyncRollouter] rollout time {end_time - start_time:.2f} seconds") - - if gen_batch_output is not None: - # prepare rollout metadata - rollout_metadata = { - "timing": timing_raw, - "generation_timestamp": time.time(), - "rollout_param_version": self.current_param_version, - } - batch = self._post_generate_batch(batch, gen_batch_output, metrics) - - start_time = time.time() - for sample in batch: - # for sample in samples: - queue_sample = QueueSample( - data=sample, - rollout_metadata=rollout_metadata, - ) - success = self.message_queue_client.put_sample( - sample=ray.cloudpickle.dumps(queue_sample), - param_version=self.current_param_version, - ) - with self.lock: - if success: - self.total_generated_samples 
+= 1 - self.train_step_samples += 1 - else: - self.dropped_stale_samples += 1 - end_time = time.time() - print(f"[FullyAsyncRollouter] mq push time {end_time - start_time:.2f} seconds") + try: + # 等待样本添加完成 + await self.feed_task + print("[FullyAsyncRollouter] 样本添加完成") - self.global_steps += 1 + # 等待流式处理完成 + await self.stream_processor_task + print("[FullyAsyncRollouter] 流式处理完成") - if is_last_step: - pprint(f"[FullyAsyncRollouter] Final validation metrics: {last_val_metrics}") - break + # 等待结果队列清空 + await self.result_queue.join() + print("[FullyAsyncRollouter] 所有结果处理完成") + + except Exception as e: + print(f"[FullyAsyncRollouter] 流式处理异常: {e}") + + finally: + # 取消所有任务 + if self.stream_processor_task: + self.stream_processor_task.cancel() + if self.consumer_task: + self.consumer_task.cancel() + + # 等待任务结束 + await asyncio.gather(self.stream_processor_task, self.consumer_task, return_exceptions=True) - with self.lock: + async with self.lock: self.running = False # 发送终止信号 @@ -338,34 +435,80 @@ def _generation_loop(self): param_version=self.current_param_version, ) - def _monitor_loop(self): + def fit(self): + """Start the async rollouter - entry point that sets up and runs async tasks""" + print("[FullyAsyncRollouter] Starting FullyAsyncRollouter...") + + if self.message_queue_client is None: + raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.") + if self.param_synchronizer is None: + raise ValueError("param_synchronizer client not set. 
Call set_parameter_synchronizer() first.") + + # Run everything in a single async event loop + asyncio.run(self._async_fit()) + + async def _async_fit(self): + """Main async fit method that coordinates all coroutines""" + # 设置运行状态 + async with self.lock: + self.running = True + self.paused = False + + # 创建主要的异步任务 + generation_task = asyncio.create_task(self._streaming_generation_main()) + monitor_task = asyncio.create_task(self._async_monitor_loop()) + + try: + # 并发运行生成和监控任务 + await asyncio.gather(generation_task, monitor_task, return_exceptions=True) + except Exception as e: + print(f"[FullyAsyncRollouter] 异步任务执行出错: {e}") + finally: + # 清理任务 + if not generation_task.done(): + generation_task.cancel() + if not monitor_task.done(): + monitor_task.cancel() + + # 等待任务完成 + await asyncio.gather(generation_task, monitor_task, return_exceptions=True) + + print("[FullyAsyncRollouter] Rollouter fit completed") + + async def _async_monitor_loop(self): """ + Async coroutine for monitoring: Function 1: Log information output Function 2: Trigger rollout recovery """ last_stats_time = time.time() stats_interval = 30.0 check_interval = 5.0 + while True: - with self.lock: + async with self.lock: if not self.running: break - time.sleep(check_interval) + + await asyncio.sleep(check_interval) + # 定期打印统计信息 current_time = time.time() if current_time - last_stats_time >= stats_interval: - print(f"[FullyAsyncRollouter] {self.get_statistics()}") + stats = await self.get_statistics() + print(f"[FullyAsyncRollouter] {stats}") last_stats_time = current_time - if not self._should_pause_generation(): - self.resume() - def _should_pause_generation(self) -> bool: + if not await self._should_pause_generation(): + await self.resume() + + async def _should_pause_generation(self) -> bool: """Determine whether the build should be paused""" - try: - queue_stats = self.message_queue_client.get_statistics() - queue_size = queue_stats["queue_size"] - current_trainer_version = 
queue_stats["current_param_version"] + queue_stats = self.message_queue_client.get_statistics() + queue_size = queue_stats["queue_size"] + current_trainer_version = queue_stats["current_param_version"] + async with self.lock: version_diff = self.current_param_version - current_trainer_version if version_diff > self.staleness_threshold: @@ -378,26 +521,27 @@ def _should_pause_generation(self) -> bool: return True if queue_size >= self.max_queue_size: - print(f"[FullyAsyncRollouter] Should pause due to full queue: size={queue_size}, max={self.max_queue_size}") + print( + f"[FullyAsyncRollouter] Should pause due to full queue: " + f"size={queue_size}, max={self.max_queue_size}" + ) return True if self.train_step_samples >= self.max_required_samples: - print(f"[FullyAsyncRollouter] Should pause due to step_generated_samples >= max_required_samples: " - f"self.step_generated_samples={self.train_step_samples}, max={self.max_required_samples}") + print( + f"[FullyAsyncRollouter] Should pause due to step_generated_samples >= max_required_samples: " + f"self.step_generated_samples={self.train_step_samples}, max={self.max_required_samples}" + ) return True return False - except Exception as e: - print(f"[FullyAsyncRollouter] Error checking pause conditions: {e}") - return True - - def pause(self) -> bool: + async def pause(self) -> bool: """pause rollout TODO integrated Partial Rollout """ - print(f"[FullyAsyncRollouter] pause") - with self.lock: + print("[FullyAsyncRollouter] pause") + async with self.lock: if not self.running: return False @@ -407,12 +551,12 @@ def pause(self) -> bool: self.paused = True return True - def resume(self) -> bool: + async def resume(self) -> bool: """resume rollout TODO integrated Partial Rollout """ - print(f"[FullyAsyncRollouter] resume") - with self.lock: + print("[FullyAsyncRollouter] resume") + async with self.lock: if not self.running: return False @@ -423,8 +567,8 @@ def resume(self) -> bool: self.condition.notify_all() return True - 
def get_statistics(self) -> dict: - with self.lock: + async def get_statistics(self) -> dict: + async with self.lock: queue_stats = self.message_queue_client.get_statistics() stats = { "is_running": self.running, @@ -432,7 +576,12 @@ def get_statistics(self) -> dict: "train_step_samples": self.train_step_samples, "dropped_stale_samples": self.dropped_stale_samples, "current_param_version": self.current_param_version, - "queue_size": queue_stats['queue_size'], + "queue_size": queue_stats["queue_size"], "queue_max_size": self.max_queue_size, + "max_concurrent_samples": self.max_concurrent_samples, + "max_processing_time": self.max_processing_time, + "pending_samples_queue_size": self.pending_samples_queue.qsize(), + "result_queue_size": self.result_queue.qsize(), } + return stats diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 4ee8fa52332..d9883aaf33f 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -44,16 +44,16 @@ class FullyAsyncTrainer(RayPPOTrainer): """ def __init__( - self, - config, - tokenizer, - role_worker_mapping: dict[Role, WorkerType], - resource_pool_manager: ResourcePoolManager, - ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, - processor=None, - reward_fn=None, - val_reward_fn=None, - device_name=None, + self, + config, + tokenizer, + role_worker_mapping: dict[Role, WorkerType], + resource_pool_manager: ResourcePoolManager, + ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, + processor=None, + reward_fn=None, + val_reward_fn=None, + device_name=None, ): # Store the tokenizer for text processing self.tokenizer = tokenizer @@ -117,6 +117,7 @@ def get_actor_wg(self): def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]: """ Get samples from message queue and compose gen_batch_output + Uses a loop to continuously collect samples until enough are gathered Returns: tuple: (epoch, 
batch_dict, gen_batch_output) @@ -132,19 +133,39 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]: flush=True, ) - # Get samples from queue + # Collect samples using a simple loop calling get_sample consumer_start = time.time() - queue_samples, queue_len = self.message_queue_client.get_samples(min_batch_count=required_samples) + queue_samples = [] + + print(f"[FullyAsyncTrainer] Starting sample collection loop, required={required_samples}") + + while len(queue_samples) < required_samples: + # 获取单个样本,会一直等待直到有样本或收到None + sample = self.message_queue_client.get_sample() + + if sample is None: + # 检测到结束信号(None),立即退出 + logger.info( + f"Detected termination signal (None), stopping sample collection. " + f"Collected {len(queue_samples)}/{required_samples} samples" + ) + break + + queue_samples.append(sample) + + if len(queue_samples) % 10 == 0 or len(queue_samples) >= required_samples: + print(f"[FullyAsyncTrainer] Collected {len(queue_samples)}/{required_samples} samples") + consumer_end = time.time() - if not queue_samples or len(queue_samples) == 0: - logger.warning("required_samples is empty") + if not queue_samples or len(queue_samples) < required_samples: + logger.warning("not enough samples collected after loop") return None, None - print(f"[FullyAsyncTrainer] Retrieved {len(queue_samples)} samples from queue. " - f"wait time {consumer_end - consumer_start:.2f} seconds. " - f"queue len {queue_len}. 
" - ) + print( + f"[FullyAsyncTrainer] Loop collection completed: {len(queue_samples)}/{required_samples} samples, " + f"total wait time: {consumer_end - consumer_start:.2f} seconds" + ) queue_samples = [ray.cloudpickle.loads(x) for x in queue_samples] # Assemble batch @@ -154,12 +175,10 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]: def _assemble_gen_batch_output_from_queue_samples(self, queue_samples: list[QueueSample]): """ - Assemble gen_batch_output from queue samples + Assemble gen_batch_output from queue samples containing AgentLoopOutput Args: - queue_samples: List of samples from queue - n_responses_per_prompt: Number of responses per prompt - batch_size: Batch size + queue_samples: List of samples from queue, each containing AgentLoopOutput Returns: DataProto: Assembled gen_batch_output @@ -168,23 +187,29 @@ def _assemble_gen_batch_output_from_queue_samples(self, queue_samples: list[Queu import numpy as np - from verl.protocol import DataProto - if not queue_samples: raise ValueError("Empty queue_samples provided for batch assembly") - print(f"[FullyAsyncTrainer] Assembling batch from {len(queue_samples)} queue samples") + print(f"[FullyAsyncTrainer] Assembling batch from {len(queue_samples)} queue samples with AgentLoopOutput") - # Extract data and metadata from all samples - sample_data_list = [] + # Extract AgentLoopOutput and metadata from all samples + agent_loop_outputs = [] rollout_metadata_list = [] - timing_info = {} + processing_times = [] - for i, sample in enumerate(queue_samples): - sample_data_list.append(sample.data) + for sample in queue_samples: + # sample.data is now AgentLoopOutput + agent_loop_outputs.append(sample.data) rollout_metadata_list.append(sample.rollout_metadata) + processing_times.append(sample.rollout_metadata.get("processing_time", 0)) + + # Use the static method to postprocess AgentLoopOutput list into DataProto + from verl.experimental.agent_loop.agent_loop import AgentLoopWorker - batch = 
DataProto.from_items(sample_data_list) + batch = AgentLoopWorker.postprocess_agent_loop_outputs(agent_loop_outputs, self.tokenizer, self.config) + + # Apply _post_generate_batch logic here + batch = self._post_generate_batch_for_agent_outputs(batch, agent_loop_outputs) # Collect timing information and metadata param_versions = [] @@ -193,21 +218,10 @@ def _assemble_gen_batch_output_from_queue_samples(self, queue_samples: list[Queu # Extract parameter version and timestamp param_versions.append(metadata.get("rollout_param_version", 0)) sample_timestamps.append(metadata.get("generation_timestamp", time.time())) - if "timing" in metadata: - for timing_key, timing_value in metadata["timing"].items(): - if timing_key not in timing_info: - timing_info[timing_key] = [] - # if isinstance(timing_value, (int, float)): - # timing_info[timing_key].append(timing_value) - # Calculate average timing - avg_timing = {} - for key, values in timing_info.items(): - if values and len(values) > 0: - avg_timing[key] = sum(values) / len(values) # Create meta_info meta_info = { - "timing": avg_timing, + "timing": {"avg_processing_time": np.mean(processing_times) if processing_times else 0}, "queue_sample_count": len(queue_samples), "rollout_param_versions": param_versions, "sample_timestamps": sample_timestamps, @@ -215,8 +229,47 @@ def _assemble_gen_batch_output_from_queue_samples(self, queue_samples: list[Queu "avg_sample_age": np.mean([time.time() - ts for ts in sample_timestamps]), } + batch.meta_info.update(meta_info) + end_time = time.time() - print(f"[FullyAsyncTrainer] {meta_info} time elapsed: {end_time - start_time:.2f} seconds") + print( + f"[FullyAsyncTrainer] Assembled batch with meta_info: " + f"{meta_info}, time elapsed: {end_time - start_time:.2f} seconds" + ) + + return batch + + def _post_generate_batch_for_agent_outputs(self, batch, agent_loop_outputs): + """ + Apply _post_generate_batch logic for AgentLoopOutput + + Args: + batch: DataProto created from 
AgentLoopWorker.postprocess_agent_loop_outputs + agent_loop_outputs: List of AgentLoopOutput + + Returns: + DataProto: Processed batch with additional metadata + """ + import uuid + + import numpy as np + import torch + + from verl.trainer.ppo.ray_trainer import compute_response_mask + + # Add UIDs + batch.non_tensor_batch["uid"] = np.array([str(uuid.uuid4()) for _ in range(len(batch.batch))], dtype=object) + + # response_mask should already be in batch from AgentLoopWorker.postprocess_agent_loop_outputs + if "response_mask" not in batch.batch.keys(): + batch.batch["response_mask"] = compute_response_mask(batch) + + # Balance the number of valid tokens across DP ranks if needed + if self.config.trainer.balance_batch: + self._balance_batch(batch, metrics={}) + + # compute global_valid tokens + batch.meta_info["global_token_num"] = torch.sum(batch.batch["attention_mask"], dim=-1).tolist() return batch @@ -293,7 +346,7 @@ def fit(self): if batch is None: break - # 更新统计信息 + # 更新统计信息 self.processed_samples += len(batch) if isinstance(batch, list) else 1 # 从meta_info中获取参数版本信息 diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py index e86c006106e..4d1eddee6ae 100644 --- a/recipe/fully_async_policy/message_queue.py +++ b/recipe/fully_async_policy/message_queue.py @@ -138,6 +138,26 @@ def get_samples(self, min_batch_count: int = 1) -> tuple[list[Any], int]: self.total_consumed += len(samples) return samples, len(self.queue) + def get_sample(self) -> Any | None: + """ + Get a single sample from the queue, wait until one is available + + Returns: + Any: Single sample data or None if queue is closed + """ + with self.lock: + while len(self.queue) == 0 and self.running: + self.consumer_condition.wait() + + # If queue is closed and empty, return None + if not self.running and len(self.queue) == 0: + return None + + # Get one sample + data = self.queue.popleft() + self.total_consumed += 1 + return data + def 
update_param_version(self, version: int): """Update current parameter version""" with self.lock: @@ -221,6 +241,10 @@ def get_samples(self, min_batch_count: int = 1) -> tuple[list[Any], int]: """Get batch from queue, wait until enough samples are available""" return ray.get(self.queue_actor.get_samples.remote(min_batch_count)) + def get_sample(self) -> Any | None: + """Get single sample from queue, wait until one is available""" + return ray.get(self.queue_actor.get_sample.remote()) + def update_param_version(self, version: int): """Update parameter version""" ray.get(self.queue_actor.update_param_version.remote(version)) diff --git a/recipe/fully_async_policy/simple_streaming_demo.py b/recipe/fully_async_policy/simple_streaming_demo.py new file mode 100644 index 00000000000..d3ae0702e3f --- /dev/null +++ b/recipe/fully_async_policy/simple_streaming_demo.py @@ -0,0 +1,176 @@ +# Copyright 2025 Meituan Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import asyncio +import random +import time + + +class SimpleStreamingSystem: + """简化的流式处理系统演示""" + + def __init__(self, max_concurrent_tasks: int = 4): + self.max_concurrent_tasks = max_concurrent_tasks + self.data_queue = asyncio.Queue() + self.result_queue = asyncio.Queue() + self.consumer_count = 0 + + # 数据流协程 + async def data_stream(self): + # 添加初始数据 + # 准备测试数据 + test_data = [{"id": f"task_{i}", "content": f"数据_{i}"} for i in range(8)] + await self.add_data_stream(test_data) + + # 模拟后续数据流 + await asyncio.sleep(3) + print("\n添加第二批数据...") + extra_data = [{"id": f"extra_{i}", "content": f"额外数据_{i}"} for i in range(5)] + await self.add_data_stream(extra_data) + + # 发送结束信号 + await asyncio.sleep(1) + await self.data_queue.put("DONE") + print("发送结束信号") + + async def add_data_stream(self, data_list: list[dict]): + """模拟数据流""" + print("开始添加数据流...") + + for i, data_item in enumerate(data_list): + await self.data_queue.put(data_item) + print(f"数据 {data_item['id']} 进入待处理队列") + + # 模拟数据流的间隔 + if i < len(data_list) - 1: # 最后一个不等待 + await asyncio.sleep(0.8) + + print("初始数据流添加完成") + + async def _process_data_async(self, data_item: dict): + """异步处理单个数据项""" + data_id = data_item["id"] + content = data_item["content"] + + # 模拟不同的处理时间(1-3秒) + processing_time = random.uniform(1, 3) + + print(f" 开始处理 {data_id},预计耗时 {processing_time:.1f}s") + + # 异步等待处理完成 + await asyncio.sleep(processing_time) + + result = { + "id": data_id, + "processed_content": f"处理后的{content}", + "processing_time": round(processing_time, 2), + "completed_at": time.time(), + } + + # 立即放入结果队列 + await self.result_queue.put(result) + print(f" {data_id} 处理完成!(耗时 {processing_time:.1f}s) -> 进入结果队列") + + async def _submit_worker(self): + """流式提交工作协程""" + active_tasks = set() + + print("流式提交器启动...") + + while True: + # 获取待处理数据 + data_item = await self.data_queue.get() + + if data_item == "DONE": + print("收到结束信号,等待剩余任务完成...") + if active_tasks: + await asyncio.gather(*active_tasks, return_exceptions=True) + break + + # 
检查并发数限制 + while len(active_tasks) >= self.max_concurrent_tasks: + print(f"达到最大并发数 {self.max_concurrent_tasks},等待任务完成...") + done_tasks, active_tasks = await asyncio.wait(active_tasks, return_when=asyncio.FIRST_COMPLETED) + + # 清理完成的任务 + for task in done_tasks: + try: + await task + print(f"task 完成 {task}") + except Exception as e: + print(f"任务执行失败: {e}") + + # 立即提交新任务 + task = asyncio.create_task(self._process_data_async(data_item), name=f"active {data_item}") + active_tasks.add(task) + + print(f"提交任务 {data_item['id']},当前并发数: {len(active_tasks)}") + + async def _consumer_worker(self): + """结果消费协程""" + print("消费者启动...") + + while True: + try: + # 从结果队列获取处理结果 + result = await asyncio.wait_for(self.result_queue.get(), timeout=2.0) + + self.consumer_count += 1 + + print( + f"消费 #{self.consumer_count}: {result['id']} " + f"(处理时间 {result['processing_time']}s) - {result['processed_content']}" + ) + + except asyncio.TimeoutError: + print(" 消费者等待中...") + await asyncio.sleep(0.5) + + async def run_demo(self): + """运行演示""" + print("=" * 60) + print(f"最大并发数: {self.max_concurrent_tasks}") + print("=" * 60) + + # 启动核心协程 + stream_task = asyncio.create_task(self.data_stream()) + submit_task = asyncio.create_task(self._submit_worker()) + consumer_task = asyncio.create_task(self._consumer_worker()) + + try: + # 等待数据流完成 + await stream_task + print("数据流完成") + + # 等待处理完成 + await submit_task + print("所有任务处理完成") + + finally: + # 清理 + submit_task.cancel() + consumer_task.cancel() + await asyncio.gather(submit_task, consumer_task, return_exceptions=True) + + print(f"\n最终统计: 消费了 {self.consumer_count} 个结果") + + +async def main(): + """主函数""" + system = SimpleStreamingSystem(max_concurrent_tasks=3) + await system.run_demo() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/recipe/one_step_off_policy/ray_trainer.py b/recipe/one_step_off_policy/ray_trainer.py index 893760965d0..ef8d6d8792e 100644 --- a/recipe/one_step_off_policy/ray_trainer.py +++ 
b/recipe/one_step_off_policy/ray_trainer.py @@ -76,20 +76,20 @@ class OneStepOffRayTrainer(RayPPOTrainer): # TODO: support each role have individual ray_worker_group_cls, # i.e., support different backend of different role def __init__( - self, - config, - tokenizer, - role_worker_mapping: dict[Role, WorkerType], - resource_pool_manager: ResourcePoolManager, - ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, - processor=None, - reward_fn=None, - val_reward_fn=None, - train_dataset: Dataset | None = None, - val_dataset: Dataset | None = None, - collate_fn=None, - train_sampler: Sampler | None = None, - device_name=None, + self, + config, + tokenizer, + role_worker_mapping: dict[Role, WorkerType], + resource_pool_manager: ResourcePoolManager, + ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, + processor=None, + reward_fn=None, + val_reward_fn=None, + train_dataset: Dataset | None = None, + val_dataset: Dataset | None = None, + collate_fn=None, + train_sampler: Sampler | None = None, + device_name=None, ): """ Initialize distributed PPO trainer with Ray backend. 
diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py index ef86381020b..34f4c78833c 100644 --- a/verl/experimental/agent_loop/agent_loop.py +++ b/verl/experimental/agent_loop/agent_loop.py @@ -16,6 +16,7 @@ import logging import os import random +import time from abc import ABC, abstractmethod from typing import Any @@ -200,6 +201,81 @@ def decorator(subclass: type[AgentLoopBase]) -> type[AgentLoopBase]: return decorator +def postprocess_agent_loop_outputs(inputs: list[AgentLoopOutput], tokenizer, config) -> DataProto: + """Static method to postprocess a list of AgentLoopOutput into DataProto + + Args: + inputs: List of AgentLoopOutput + tokenizer: Tokenizer instance + config: Configuration object + + Returns: + DataProto: Processed batch data + """ + # NOTE: consistent with batch version of generate_sequences in vllm_rollout_spmd.py + # prompts: left pad + # responses: right pad + # input_ids: prompt + response + # attention_mask: [0,0,0,0,1,1,1,1, | 1,1,1,0,0,0,0,0] + # position_ids: [0,0,0,0,0,1,2,3, | 4,5,6,7,8,9,10,11] + + # prompts + tokenizer.padding_side = "left" + outputs = tokenizer.pad( + [{"input_ids": input.prompt_ids} for input in inputs], + padding="max_length", + max_length=config.actor_rollout_ref.rollout.prompt_length, + return_tensors="pt", + return_attention_mask=True, + ) + prompt_ids, prompt_attention_mask = outputs["input_ids"], outputs["attention_mask"] + + # responses + tokenizer.padding_side = "right" + outputs = tokenizer.pad( + [{"input_ids": input.response_ids} for input in inputs], + padding="max_length", + max_length=config.actor_rollout_ref.rollout.response_length, + return_tensors="pt", + return_attention_mask=True, + ) + response_ids, response_attention_mask = outputs["input_ids"], outputs["attention_mask"] + + # response_mask + outputs = tokenizer.pad( + [{"input_ids": input.response_mask} for input in inputs], + padding="max_length", + 
max_length=config.actor_rollout_ref.rollout.response_length, + return_tensors="pt", + return_attention_mask=False, + ) + response_mask = outputs["input_ids"] + assert response_ids.shape == response_mask.shape, ( + f"mismatch in response_ids and response_mask shape: {response_ids.shape} vs {response_mask.shape}" + ) + response_mask = response_mask * response_attention_mask + + input_ids = torch.cat([prompt_ids, response_ids], dim=1) + attention_mask = torch.cat([prompt_attention_mask, response_attention_mask], dim=1) + position_ids = (attention_mask.cumsum(dim=1) - 1) * attention_mask + + batch = TensorDict( + { + "prompts": prompt_ids, # [bsz, prompt_length] + "responses": response_ids, # [bsz, response_length] + "response_mask": response_mask, # [bsz, response_length] + "input_ids": input_ids, # [bsz, prompt_length + response_length] + "attention_mask": attention_mask, # [bsz, prompt_length + response_length] + "position_ids": position_ids, # [bsz, prompt_length + response_length] + }, + batch_size=len(input_ids), + ) + + num_turns = np.array([input.num_turns for input in inputs], dtype=np.int32) + metrics = [input.metrics.model_dump() for input in inputs] + return DataProto(batch=batch, non_tensor_batch={"__num_turns__": num_turns}, meta_info={"metrics": metrics}) + + @ray.remote class AgentLoopWorker: """Agent loop worker takes a batch of messages and run each message in an agent loop.""" @@ -289,9 +365,60 @@ async def generate_sequences(self, batch: DataProto) -> DataProto: ) outputs = await asyncio.gather(*tasks) - output = self._postprocess(outputs) + output = postprocess_agent_loop_outputs(outputs, self.tokenizer, self.config) return output + async def generate_sequences_no_post(self, batch: DataProto) -> list[AgentLoopOutput]: + """Generate sequences from agent loop. + + Args: + batch (DataProto): Input batch. + + Returns: + list[AgentLoopOutput]: List of agent loop outputs, one per sample in the batch. 
+ Each AgentLoopOutput contains: + - prompt_ids: prompt token ids + - response_ids: response token ids including LLM generated and tool response tokens + - response_mask: 1 for LLM generated tokens, 0 for tool response tokens + - num_turns: number of chat turns + - metrics: performance metrics + """ + config = self.config.actor_rollout_ref.rollout + sampling_params = dict( + temperature=config.temperature, + top_p=config.top_p, + repetition_penalty=1.0, + ) + + # override sampling params for validation + if batch.meta_info.get("validate", False): + sampling_params["top_p"] = config.val_kwargs.top_p + sampling_params["temperature"] = config.val_kwargs.temperature + + # by default, we assume it's a single turn agent + if "agent_name" not in batch.non_tensor_batch: + batch.non_tensor_batch["agent_name"] = np.array(["single_turn_agent"] * len(batch), dtype=object) + + tasks = [] + agent_names = batch.non_tensor_batch["agent_name"] + raw_prompts = batch.non_tensor_batch["raw_prompt"] + if "index" in batch.non_tensor_batch: + index = batch.non_tensor_batch["index"] + else: + index = np.arange(len(raw_prompts)) + + trajectory_info = await get_trajectory_info( + batch.meta_info.get("global_steps", -1), index, batch.meta_info.get("validate", False) + ) + + for agent_name, messages, trajectory in zip(agent_names, raw_prompts, trajectory_info, strict=True): + tasks.append( + asyncio.create_task(self._run_agent_loop(agent_name, messages.tolist(), sampling_params, trajectory)) + ) + outputs = await asyncio.gather(*tasks) + + return outputs + async def _run_agent_loop( self, agent_name: str, @@ -320,70 +447,6 @@ async def _run_agent_loop( output = await agent_loop.run(messages, sampling_params) return output - def _postprocess(self, inputs: list[AgentLoopOutput]) -> DataProto: - # NOTE: consistent with batch version of generate_sequences in vllm_rollout_spmd.py - # prompts: left pad - # responses: right pad - # input_ids: prompt + response - # attention_mask: [0,0,0,0,1,1,1,1, | 
1,1,1,0,0,0,0,0] - # position_ids: [0,0,0,0,0,1,2,3, | 4,5,6,7,8,9,10,11] - - # prompts - self.tokenizer.padding_side = "left" - outputs = self.tokenizer.pad( - [{"input_ids": input.prompt_ids} for input in inputs], - padding="max_length", - max_length=self.config.actor_rollout_ref.rollout.prompt_length, - return_tensors="pt", - return_attention_mask=True, - ) - prompt_ids, prompt_attention_mask = outputs["input_ids"], outputs["attention_mask"] - - # responses - self.tokenizer.padding_side = "right" - outputs = self.tokenizer.pad( - [{"input_ids": input.response_ids} for input in inputs], - padding="max_length", - max_length=self.config.actor_rollout_ref.rollout.response_length, - return_tensors="pt", - return_attention_mask=True, - ) - response_ids, response_attention_mask = outputs["input_ids"], outputs["attention_mask"] - - # response_mask - outputs = self.tokenizer.pad( - [{"input_ids": input.response_mask} for input in inputs], - padding="max_length", - max_length=self.config.actor_rollout_ref.rollout.response_length, - return_tensors="pt", - return_attention_mask=False, - ) - response_mask = outputs["input_ids"] - assert response_ids.shape == response_mask.shape, ( - f"mismatch in response_ids and response_mask shape: {response_ids.shape} vs {response_mask.shape}" - ) - response_mask = response_mask * response_attention_mask - - input_ids = torch.cat([prompt_ids, response_ids], dim=1) - attention_mask = torch.cat([prompt_attention_mask, response_attention_mask], dim=1) - position_ids = (attention_mask.cumsum(dim=1) - 1) * attention_mask - - batch = TensorDict( - { - "prompts": prompt_ids, # [bsz, prompt_length] - "responses": response_ids, # [bsz, response_length] - "response_mask": response_mask, # [bsz, response_length] - "input_ids": input_ids, # [bsz, prompt_length + response_length] - "attention_mask": attention_mask, # [bsz, prompt_length + response_length] - "position_ids": position_ids, # [bsz, prompt_length + response_length] - }, - 
batch_size=len(input_ids), - ) - - num_turns = np.array([input.num_turns for input in inputs], dtype=np.int32) - metrics = [input.metrics.model_dump() for input in inputs] - return DataProto(batch=batch, non_tensor_batch={"__num_turns__": num_turns}, meta_info={"metrics": metrics}) - async def get_trajectory_info(step, index, validate): """Get trajectory info. @@ -407,6 +470,18 @@ async def get_trajectory_info(step, index, validate): return trajectory_info +async def _ray_future_to_asyncio(ray_future): + """将Ray future转换为asyncio可等待的对象""" + while True: + try: + # 非阻塞检查Ray future是否完成 + result = ray.get(ray_future, timeout=0.001) # 1ms timeout + return result + except ray.exceptions.GetTimeoutError: + # 未完成,让出控制权给其他协程 + await asyncio.sleep(1) # 1s sleep + + class AgentLoopManager: """Agent loop manager that manages a group of agent loop workers.""" @@ -512,6 +587,46 @@ def generate_sequences(self, prompts: DataProto) -> DataProto: output.meta_info = {"timing": timing} return output + async def generate_single_sample_async(self, sample: DataProto, sample_id: str) -> tuple[AgentLoopOutput, float]: + """ + 异步处理单个样本 - 用于流式推理的核心方法 + + Args: + sample: 单个样本数据 + sample_id: 样本ID + + Returns: + tuple[AgentLoopOutput, float]: 处理结果和处理时间 + """ + start_time = time.time() + + # 使用负载均衡选择 worker + worker = self._select_best_worker() + + # 异步处理单个样本 + output_future = worker.generate_sequences.remote(sample) + outputs = await _ray_future_to_asyncio(output_future) + + processing_time = time.time() - start_time + + # outputs 是 AgentLoopOutput 列表,取第一个(因为是单样本) + assert len(outputs) == 1, f"Expected single output for single sample, got {len(outputs)}" + output = outputs[0] + + # 添加处理时间到metrics + output.metrics.generate_sequences = processing_time + + return output, processing_time + + def _select_best_worker(self): + """选择最佳的 worker(简单的轮询负载均衡)""" + if not hasattr(self, "_worker_index"): + self._worker_index = 0 + + worker = self.agent_loop_workers[self._worker_index] + self._worker_index = 
(self._worker_index + 1) % len(self.agent_loop_workers) + return worker + def _performance_metrics(self, metrics: list[list[dict[str, str]]], output: DataProto) -> dict[str, float]: timing = {} t_generate_sequences = np.array([metric["generate_sequences"] for chunk in metrics for metric in chunk]) diff --git a/verl/trainer/main_ppo.py b/verl/trainer/main_ppo.py index e81d0b32c1d..fa12105f07f 100644 --- a/verl/trainer/main_ppo.py +++ b/verl/trainer/main_ppo.py @@ -41,7 +41,7 @@ def main(config): # Define a function to run the PPO-like training process -def run_ppo(config, task_runner_class = None) -> None: +def run_ppo(config, task_runner_class=None) -> None: """Initialize Ray cluster and run distributed PPO training process. Args: diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py index e8398fd0865..26150cc631d 100644 --- a/verl/trainer/ppo/ray_trainer.py +++ b/verl/trainer/ppo/ray_trainer.py @@ -1247,7 +1247,6 @@ def _process_batch_common(self, batch, metrics, timing_raw): reward_tensor, reward_extra_infos_dict = compute_reward(batch, self.reward_fn) # recompute old_log_probs with marked_timer("old_log_prob", timing_raw, color="blue"): - old_log_prob = self.actor_rollout_wg.compute_log_prob(batch) entropys = old_log_prob.batch["entropys"] response_masks = batch.batch["response_mask"] From 0d7233f648a592aa7c352d65fd1e471d00a887ff Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Thu, 14 Aug 2025 11:32:35 +0800 Subject: [PATCH 043/182] async mq --- .../fully_async_rollouter.py | 2 +- recipe/fully_async_policy/message_queue.py | 166 ++++--- .../unittest/ray_async_resource_config.py | 366 ++++++++++++++++ .../{ => unittest}/simple_streaming_demo.py | 0 .../unittest/test_asyncio_message_queue.py | 407 ++++++++++++++++++ verl/experimental/agent_loop/agent_loop.py | 27 +- 6 files changed, 891 insertions(+), 77 deletions(-) create mode 100644 recipe/fully_async_policy/unittest/ray_async_resource_config.py rename recipe/fully_async_policy/{ => 
unittest}/simple_streaming_demo.py (100%) create mode 100644 recipe/fully_async_policy/unittest/test_asyncio_message_queue.py diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index ae74cc838b1..0060cfa1b02 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -24,7 +24,7 @@ from verl.utils.tracking import ValidationGenerationsLogger -@ray.remote(num_cpus=10, max_concurrency=10) +@ray.remote(num_cpus=10, max_concurrency=100) class FullyAsyncRollouter(RayPPOTrainer): """ Asynchronous sample generator, responsible for continuously generating training samples diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py index 4d1eddee6ae..2e8ad6b0e79 100644 --- a/recipe/fully_async_policy/message_queue.py +++ b/recipe/fully_async_policy/message_queue.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import asyncio import logging -import threading from collections import deque from dataclasses import dataclass from typing import Any @@ -30,10 +30,11 @@ class QueueSample: rollout_metadata: dict[str, Any] -@ray.remote(num_cpus=10, max_concurrency=10) +@ray.remote(num_cpus=2, max_concurrency=20) class MessageQueue: """ Simplified Ray-based asynchronous message queue for communication between Rollouter and Trainer + 使用 asyncio 实现异步消息队列 """ def __init__(self, config: DictConfig, max_queue_size: int = 1000): @@ -50,12 +51,12 @@ def __init__(self, config: DictConfig, max_queue_size: int = 1000): except (AttributeError, RecursionError): self.staleness_threshold = 3 - # Threading for message handling + # Asyncio for message handling self.running = True - # thread safe - self.lock = threading.RLock() - self.consumer_condition = threading.Condition(self.lock) + # async safe - 在第一次使用时初始化 + self._lock = None + self._consumer_condition = None # statistic message self.total_produced = 0 @@ -67,7 +68,13 @@ def __init__(self, config: DictConfig, max_queue_size: int = 1000): f"staleness_threshold={self.staleness_threshold}" ) - def put_sample(self, sample: Any, param_version: int) -> bool: + async def _ensure_async_primitives(self): + """确保异步原语已初始化""" + if self._lock is None: + self._lock = asyncio.Lock() + self._consumer_condition = asyncio.Condition(self._lock) + + async def put_sample(self, sample: Any, param_version: int) -> bool: """ Put a batch sample into the queue @@ -78,7 +85,9 @@ def put_sample(self, sample: Any, param_version: int) -> bool: Returns: bool: Whether the sample was successfully put into the queue """ - with self.lock: + await self._ensure_async_primitives() + + async with self._lock: # Check freshness staleness = self.current_param_version - param_version if staleness > self.staleness_threshold: @@ -95,14 +104,14 @@ def put_sample(self, sample: Any, param_version: int) -> bool: self.total_produced += 1 # Notify waiting consumers - 
self.consumer_condition.notify() + self._consumer_condition.notify() if self.total_produced % 100 == 0: logger.debug(f"MessageQueue stats: produced={self.total_produced}, queue_size={len(self.queue)}") return True - def get_samples(self, min_batch_count: int = 1) -> tuple[list[Any], int]: + async def get_samples(self, min_batch_count: int = 1) -> tuple[list[Any], int]: """ Get batch samples from the queue, wait until enough samples are available @@ -112,13 +121,14 @@ def get_samples(self, min_batch_count: int = 1) -> tuple[list[Any], int]: Returns: List[Any]: List of retrieved samples """ + await self._ensure_async_primitives() - with self.lock: + async with self._lock: while len(self.queue) < min_batch_count and self.running: print(f"[MessageQueue] consumer_condition {len(self.queue)}") if len(self.queue) > 0 and self.queue[-1] is None: return [], len(self.queue) - self.consumer_condition.wait() + await self._consumer_condition.wait() # If queue is closed and doesn't have enough samples, return empty list if not self.running and len(self.queue) < min_batch_count: @@ -138,16 +148,18 @@ def get_samples(self, min_batch_count: int = 1) -> tuple[list[Any], int]: self.total_consumed += len(samples) return samples, len(self.queue) - def get_sample(self) -> Any | None: + async def get_sample(self) -> Any | None: """ Get a single sample from the queue, wait until one is available Returns: Any: Single sample data or None if queue is closed """ - with self.lock: + await self._ensure_async_primitives() + + async with self._lock: while len(self.queue) == 0 and self.running: - self.consumer_condition.wait() + await self._consumer_condition.wait() # If queue is closed and empty, return None if not self.running and len(self.queue) == 0: @@ -158,21 +170,27 @@ def get_sample(self) -> Any | None: self.total_consumed += 1 return data - def update_param_version(self, version: int): + async def update_param_version(self, version: int): """Update current parameter version""" - with 
self.lock: + await self._ensure_async_primitives() + + async with self._lock: old_version = self.current_param_version self.current_param_version = version logger.debug(f"Parameter version updated from {old_version} to {version}") - def get_queue_size(self) -> int: + async def get_queue_size(self) -> int: """Get current queue length""" - with self.lock: + await self._ensure_async_primitives() + + async with self._lock: return len(self.queue) - def get_statistics(self) -> dict[str, Any]: + async def get_statistics(self) -> dict[str, Any]: """Get queue statistics""" - with self.lock: + await self._ensure_async_primitives() + + async with self._lock: return { "queue_size": len(self.queue), "total_produced": self.total_produced, @@ -183,24 +201,30 @@ def get_statistics(self) -> dict[str, Any]: "max_queue_size": self.max_queue_size, } - def clear_queue(self): + async def clear_queue(self): """Clear the queue""" - with self.lock: + await self._ensure_async_primitives() + + async with self._lock: cleared_count = len(self.queue) self.queue.clear() logger.info(f"Cleared {cleared_count} samples from queue") - def shutdown(self): + async def shutdown(self): """Shutdown the message queue""" - with self.lock: + await self._ensure_async_primitives() + + async with self._lock: self.running = False - # Notify all waiting threads so they can exit - self.consumer_condition.notify_all() + # Notify all waiting coroutines so they can exit + self._consumer_condition.notify_all() logger.info("MessageQueue shutdown") - def get_memory_usage(self) -> dict: + async def get_memory_usage(self) -> dict: """Get memory usage statistics""" - with self.lock: + await self._ensure_async_primitives() + + async with self._lock: # Estimate memory usage of samples in queue import sys @@ -228,43 +252,65 @@ def get_memory_usage(self) -> dict: class MessageQueueClient: - """MessageQueue client for communicating with MessageQueue Actor""" + """Asyncio-compatible MessageQueue client for communicating with 
MessageQueue Actor""" def __init__(self, queue_actor: Any): self.queue_actor = queue_actor - def put_sample(self, sample: Any, param_version: int) -> bool: - """Put batch into queue""" + async def put_sample(self, sample: Any, param_version: int) -> bool: + """Put batch into queue (async)""" + future = self.queue_actor.put_sample.remote(sample, param_version) + return await asyncio.wrap_future(future.future()) + + async def get_samples(self, min_batch_count: int = 1) -> tuple[list[Any], int]: + """Get batch from queue, wait until enough samples are available (async)""" + future = self.queue_actor.get_samples.remote(min_batch_count) + return await asyncio.wrap_future(future.future()) + + async def get_sample(self) -> Any | None: + """Get single sample from queue, wait until one is available (async)""" + future = self.queue_actor.get_sample.remote() + return await asyncio.wrap_future(future.future()) + + async def update_param_version(self, version: int): + """Update parameter version (async)""" + future = self.queue_actor.update_param_version.remote(version) + await asyncio.wrap_future(future.future()) + + async def get_queue_size(self) -> int: + """Get queue size (async)""" + future = self.queue_actor.get_queue_size.remote() + return await asyncio.wrap_future(future.future()) + + async def get_statistics(self) -> dict[str, Any]: + """Get statistics (async)""" + future = self.queue_actor.get_statistics.remote() + return await asyncio.wrap_future(future.future()) + + async def clear_queue(self): + """Clear queue (async)""" + future = self.queue_actor.clear_queue.remote() + await asyncio.wrap_future(future.future()) + + async def shutdown(self): + """Shutdown queue (async)""" + future = self.queue_actor.shutdown.remote() + await asyncio.wrap_future(future.future()) + + async def get_memory_usage(self) -> dict: + """Get memory usage statistics (async)""" + future = self.queue_actor.get_memory_usage.remote() + return await asyncio.wrap_future(future.future()) + + # 
为了兼容性,保留同步版本的方法(但标记为deprecated) + def put_sample_sync(self, sample: Any, param_version: int) -> bool: + """Put batch into queue (sync - deprecated, use put_sample instead)""" return ray.get(self.queue_actor.put_sample.remote(sample, param_version)) - def get_samples(self, min_batch_count: int = 1) -> tuple[list[Any], int]: - """Get batch from queue, wait until enough samples are available""" + def get_samples_sync(self, min_batch_count: int = 1) -> tuple[list[Any], int]: + """Get batch from queue (sync - deprecated, use get_samples instead)""" return ray.get(self.queue_actor.get_samples.remote(min_batch_count)) - def get_sample(self) -> Any | None: - """Get single sample from queue, wait until one is available""" - return ray.get(self.queue_actor.get_sample.remote()) - - def update_param_version(self, version: int): - """Update parameter version""" - ray.get(self.queue_actor.update_param_version.remote(version)) - - def get_queue_size(self) -> int: - """Get queue size""" - return ray.get(self.queue_actor.get_queue_size.remote()) - - def get_statistics(self) -> dict[str, Any]: - """Get statistics""" + def get_statistics_sync(self) -> dict[str, Any]: + """Get statistics (sync - deprecated, use get_statistics instead)""" return ray.get(self.queue_actor.get_statistics.remote()) - - def clear_queue(self): - """Clear queue""" - ray.get(self.queue_actor.clear_queue.remote()) - - def shutdown(self): - """Shutdown queue""" - ray.get(self.queue_actor.shutdown.remote()) - - def get_memory_usage(self) -> dict: - """Get memory usage statistics""" - return ray.get(self.queue_actor.get_memory_usage.remote()) diff --git a/recipe/fully_async_policy/unittest/ray_async_resource_config.py b/recipe/fully_async_policy/unittest/ray_async_resource_config.py new file mode 100644 index 00000000000..40e85c9f1bd --- /dev/null +++ b/recipe/fully_async_policy/unittest/ray_async_resource_config.py @@ -0,0 +1,366 @@ +# Copyright 2025 Meituan Ltd. 
and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import random +import time + +import ray + + +# 配置1: 默认配置 +class DefaultStreamingActor: + """默认配置的流式处理Actor""" + + def __init__(self, actor_id: str): + self.actor_id = actor_id + self.processed_count = 0 + self.start_time = time.time() + self.max_concurrent_tasks = 0 + self.current_tasks = 0 + + async def process_data_async(self, data_item: dict) -> dict: + """异步处理数据""" + self.current_tasks += 1 + self.max_concurrent_tasks = max(self.max_concurrent_tasks, self.current_tasks) + + try: + task_id = data_item["id"] + processing_time = random.uniform(1, 3) + + print(f"[{self.actor_id}] 开始处理 {task_id} (当前并发: {self.current_tasks})") + + # CPU密集型任务模拟 + await asyncio.sleep(processing_time * 0.5) # I/O部分 + + # 模拟CPU计算 + total = 0 + for i in range(int(processing_time * 100000)): # CPU密集计算 + total += i * 0.001 + + await asyncio.sleep(processing_time * 0.5) # 更多I/O + + self.processed_count += 1 + + result = { + "id": task_id, + "actor_id": self.actor_id, + "processing_time": processing_time, + "processed_count": self.processed_count, + "max_concurrent": self.max_concurrent_tasks, + "compute_result": total, + "completed_at": time.time(), + } + + print(f"[{self.actor_id}] 完成处理 {task_id} (耗时: {processing_time:.1f}s)") + return result + + finally: + self.current_tasks -= 1 + + def get_stats(self) -> dict: + return { + "actor_id": self.actor_id, + "processed_count": self.processed_count, + 
"max_concurrent_tasks": self.max_concurrent_tasks, + "uptime": time.time() - self.start_time, + } + + +# 配置2: 只设置 num_cpus +@ray.remote(num_cpus=4) +class HighCpuStreamingActor(DefaultStreamingActor): + """高CPU配置的Actor""" + + pass + + +# 配置3: 只设置 max_concurrency +@ray.remote(max_concurrency=5) +class HighConcurrencyStreamingActor(DefaultStreamingActor): + """高并发配置的Actor""" + + pass + + +# 配置4: 同时设置两者 +@ray.remote(num_cpus=4, max_concurrency=8) +class OptimalStreamingActor(DefaultStreamingActor): + """最优配置的Actor""" + + pass + + +# 配置5: 极端低配置 +@ray.remote(num_cpus=1, max_concurrency=2) +class LowResourceStreamingActor(DefaultStreamingActor): + """低资源配置的Actor""" + + pass + + +class RayStreamingSystemTest: + """Ray流式处理系统测试""" + + def __init__(self): + self.test_data = [] + self.results = {} + + def generate_test_data(self, count: int = 20) -> list[dict]: + """生成测试数据""" + return [ + {"id": f"task_{i:03d}", "content": f"测试数据_{i}", "priority": random.choice(["high", "normal", "low"])} + for i in range(count) + ] + + async def test_actor_configuration(self, actor_class, config_name: str, test_data: list[dict]) -> dict: + """测试特定配置的Actor""" + print(f"\n{'=' * 60}") + print(f"测试配置: {config_name}") + print(f"{'=' * 60}") + + # 创建Actor实例 + actor = actor_class.remote(config_name) + + start_time = time.time() + + # 并发提交所有任务 + print(f"提交 {len(test_data)} 个任务...") + task_futures = [] + + for i, data_item in enumerate(test_data): + future = actor.process_data_async.remote(data_item) + task_futures.append(future) + + # 模拟流式数据到达 + if i < len(test_data) - 1: + await asyncio.sleep(0.1) # 100ms间隔 + + print("所有任务已提交,等待完成...") + + # 等待所有任务完成 + try: + results = await asyncio.gather(*[asyncio.wrap_future(future.future()) for future in task_futures]) + except Exception as e: + print(f"任务执行出错: {e}") + results = [] + + end_time = time.time() + total_time = end_time - start_time + + # 获取Actor统计信息 + stats = ray.get(actor.get_stats.remote()) + + # 计算性能指标 + performance_metrics = { + "config_name": 
config_name, + "total_tasks": len(test_data), + "completed_tasks": len(results), + "total_time": total_time, + "throughput": len(results) / total_time if total_time > 0 else 0, + "avg_processing_time": sum(r.get("processing_time", 0) for r in results) / len(results) if results else 0, + "max_concurrent_tasks": stats["max_concurrent_tasks"], + "actor_stats": stats, + "success_rate": len(results) / len(test_data) if test_data else 0, + } + + print(f"✅ 完成测试 {config_name}:") + print(f" 总任务数: {performance_metrics['total_tasks']}") + print(f" 完成任务数: {performance_metrics['completed_tasks']}") + print(f" 总耗时: {performance_metrics['total_time']:.2f}s") + print(f" 吞吐量: {performance_metrics['throughput']:.2f} tasks/s") + print(f" 最大并发: {performance_metrics['max_concurrent_tasks']}") + print(f" 成功率: {performance_metrics['success_rate'] * 100:.1f}%") + + return performance_metrics + + async def run_comprehensive_test(self): + """运行综合测试""" + print("🚀 开始Ray异步资源配置测试") + print(f"Ray集群状态: {ray.cluster_resources()}") + + # 生成测试数据 + test_data = self.generate_test_data(15) # 15个任务便于观察 + + # 测试配置列表 + test_configs = [ + (DefaultStreamingActor, "默认配置 (无特殊设置)"), + (HighCpuStreamingActor, "高CPU配置 (num_cpus=4)"), + (HighConcurrencyStreamingActor, "高并发配置 (max_concurrency=5)"), + (OptimalStreamingActor, "最优配置 (num_cpus=4, max_concurrency=8)"), + (LowResourceStreamingActor, "低资源配置 (num_cpus=1, max_concurrency=2)"), + ] + + results = {} + + # 逐个测试各种配置 + for actor_class, config_name in test_configs: + try: + result = await self.test_actor_configuration(actor_class, config_name, test_data) + results[config_name] = result + + # 测试间隔 + await asyncio.sleep(2) + + except Exception as e: + print(f"❌ 测试 {config_name} 失败: {e}") + results[config_name] = {"error": str(e)} + + # 生成对比报告 + self.generate_comparison_report(results) + + return results + + def generate_comparison_report(self, results: dict): + """生成对比报告""" + print(f"\n{'=' * 80}") + print("📊 配置对比报告") + print(f"{'=' * 80}") + + # 表头 + 
print(f"{'配置名称':<25} {'吞吐量':<12} {'最大并发':<10} {'平均处理时间':<15} {'成功率':<10}") + print("-" * 80) + + # 数据行 + best_throughput = 0 + best_config = "" + + for config_name, result in results.items(): + if "error" in result: + print(f"{config_name:<25} {'错误':<12} {'':<10} {'':<15} {'':<10}") + continue + + throughput = result.get("throughput", 0) + max_concurrent = result.get("max_concurrent_tasks", 0) + avg_time = result.get("avg_processing_time", 0) + success_rate = result.get("success_rate", 0) + + print( + f"{config_name:<25} {throughput:<12.2f} {max_concurrent:<10} " + f"{avg_time:<15.2f} {success_rate * 100:<10.1f}%" + ) + + if throughput > best_throughput: + best_throughput = throughput + best_config = config_name + + print(f"\n🏆 最佳配置: {best_config} (吞吐量: {best_throughput:.2f} tasks/s)") + + # 详细分析 + print("\n📋 配置分析:") + print("1. num_cpus 作用:") + print(" - 资源预留: 确保Actor有足够计算资源") + print(" - 节点选择: Ray选择有足够CPU的节点") + print(" - 避免资源竞争: 防止过度调度") + + print("\n2. max_concurrency 作用:") + print(" - 并发控制: 限制Actor内同时执行的任务数") + print(" - 内存保护: 防止过多并发导致内存溢出") + print(" - 性能调优: 平衡并发度和资源利用率") + + print("\n3. 
建议配置:") + print(" - CPU密集型任务: 设置较高的num_cpus,适中的max_concurrency") + print(" - I/O密集型任务: 设置较低的num_cpus,较高的max_concurrency") + print(" - 混合型任务: 平衡两个参数,根据实际测试调优") + + +async def run_resource_stress_test(): + """运行资源压力测试""" + print(f"\n{'=' * 60}") + print("🔥 资源压力测试") + print(f"{'=' * 60}") + + # 创建多个不同配置的Actor + actors = { + "高并发低CPU": OptimalStreamingActor.remote("stress_test_1"), + "低并发高CPU": ray.remote(num_cpus=8, max_concurrency=2)(DefaultStreamingActor).remote("stress_test_2"), + "平衡配置": ray.remote(num_cpus=2, max_concurrency=4)(DefaultStreamingActor).remote("stress_test_3"), + } + + # 大量并发任务 + heavy_workload = [{"id": f"heavy_{i}", "content": f"重载任务_{i}"} for i in range(50)] + + print("提交大量并发任务,观察资源使用...") + + all_futures = [] + for actor_name, actor in actors.items(): + print(f"向 {actor_name} 提交任务...") + for task in heavy_workload[:15]: # 每个Actor处理15个任务 + future = actor.process_data_async.remote(task) + all_futures.append((actor_name, future)) + + # 等待完成并记录时间 + start_time = time.time() + results = [] + + for actor_name, future in all_futures: + try: + result = await asyncio.wrap_future(future.future()) + results.append((actor_name, result)) + except Exception as e: + print(f"{actor_name} 任务失败: {e}") + + end_time = time.time() + + print(f"压力测试完成,总耗时: {end_time - start_time:.2f}s") + print(f"完成任务数: {len(results)}") + + # 按Actor分组统计 + actor_stats = {} + for actor_name, result in results: + if actor_name not in actor_stats: + actor_stats[actor_name] = [] + actor_stats[actor_name].append(result) + + for actor_name, actor_results in actor_stats.items(): + avg_time = sum(r["processing_time"] for r in actor_results) / len(actor_results) + print(f"{actor_name}: 完成 {len(actor_results)} 个任务, 平均耗时 {avg_time:.2f}s") + + +async def main(): + """主函数""" + # 初始化Ray + if not ray.is_initialized(): + ray.init( + num_cpus=16, # 设置足够的CPU资源 + object_store_memory=2000000000, # 2GB + ignore_reinit_error=True, + ) + + print("🎯 Ray异步资源配置测试") + print(f"可用资源: {ray.cluster_resources()}") + + 
try: + # 基础配置测试 + test_system = RayStreamingSystemTest() + await test_system.run_comprehensive_test() + + # 压力测试 + await run_resource_stress_test() + + print("\n✅ 所有测试完成!") + + except Exception as e: + print(f"❌ 测试执行失败: {e}") + import traceback + + traceback.print_exc() + + finally: + # 清理资源 + ray.shutdown() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/recipe/fully_async_policy/simple_streaming_demo.py b/recipe/fully_async_policy/unittest/simple_streaming_demo.py similarity index 100% rename from recipe/fully_async_policy/simple_streaming_demo.py rename to recipe/fully_async_policy/unittest/simple_streaming_demo.py diff --git a/recipe/fully_async_policy/unittest/test_asyncio_message_queue.py b/recipe/fully_async_policy/unittest/test_asyncio_message_queue.py new file mode 100644 index 00000000000..33e0d9db04d --- /dev/null +++ b/recipe/fully_async_policy/unittest/test_asyncio_message_queue.py @@ -0,0 +1,407 @@ +# Copyright 2025 Meituan Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# 测试使用 asyncio 的 MessageQueue +# 对比 @ray.remote(num_cpus, max_concurrency) 参数的实际效果 + +import asyncio +import random + +# 导入修改后的 MessageQueue +import time +from dataclasses import dataclass + +import ray +from omegaconf import DictConfig + +from recipe.fully_async_policy.message_queue import MessageQueue, MessageQueueClient, QueueSample + + +@dataclass +class TestConfig: + """测试配置""" + + async_training: dict + + +def create_test_config() -> DictConfig: + """创建测试配置""" + from omegaconf import OmegaConf + + config_dict = {"async_training": {"staleness_threshold": 3}} + return OmegaConf.create(config_dict) + + +class AsyncMessageQueueTester: + """异步消息队列测试器""" + + def __init__(self): + self.config = create_test_config() + + async def test_basic_async_operations(self): + """测试基本异步操作""" + print("\n🧪 测试基本异步操作") + print("=" * 50) + + # 创建MessageQueue Actor + queue_actor = MessageQueue.remote(self.config, max_queue_size=100) + client = MessageQueueClient(queue_actor) + + # 测试异步放入样本 + test_samples = [ + QueueSample( + data={"task_id": f"task_{i}", "content": f"测试数据_{i}"}, + rollout_metadata={"timestamp": time.time(), "version": 1}, + ) + for i in range(10) + ] + + # 异步并发放入样本 + put_tasks = [] + for i, sample in enumerate(test_samples): + task = asyncio.create_task(client.put_sample(sample, param_version=1), name=f"put_task_{i}") + put_tasks.append(task) + + # 等待所有放入任务完成 + put_results = await asyncio.gather(*put_tasks) + successful_puts = sum(put_results) + + print(f"✅ 成功放入 {successful_puts}/{len(test_samples)} 个样本") + + # 异步获取统计信息 + stats = await client.get_statistics() + print(f"📊 队列统计: {stats}") + + # 异步获取样本 + samples_batch, queue_size = await client.get_samples(min_batch_count=5) + print(f"📦 获取了 {len(samples_batch)} 个样本,剩余队列大小: {queue_size}") + + # 清理 + await client.shutdown() + + return successful_puts + + async def test_concurrent_producers_consumers(self): + """测试并发生产者和消费者""" + print("\n🏭 测试并发生产者和消费者") + print("=" * 50) + + # 创建 MessageQueue Actor + queue_actor = 
MessageQueue.remote(self.config, max_queue_size=200) + client = MessageQueueClient(queue_actor) + + # 生产者协程 + async def producer(producer_id: int, sample_count: int): + """生产者协程""" + produced = 0 + for i in range(sample_count): + sample = QueueSample( + data={ + "producer_id": producer_id, + "task_id": f"producer_{producer_id}_task_{i}", + "content": f"来自生产者{producer_id}的数据{i}", + }, + rollout_metadata={"producer_timestamp": time.time(), "producer_id": producer_id}, + ) + + success = await client.put_sample(sample, param_version=1) + if success: + produced += 1 + + # 模拟生产间隔 + await asyncio.sleep(random.uniform(0.01, 0.1)) + + print(f"🏭 生产者{producer_id} 完成,成功生产 {produced} 个样本") + return produced + + # 消费者协程 + async def consumer(consumer_id: int, target_count: int): + """消费者协程""" + consumed = 0 + start_time = time.time() + + while consumed < target_count: + try: + # 尝试获取样本,设置超时 + sample = await asyncio.wait_for(client.get_sample(), timeout=2.0) + + if sample is not None: + consumed += 1 + + if consumed % 10 == 0: + print(f"🍽️ 消费者{consumer_id} 已消费 {consumed} 个样本") + else: + print(f"⚠️ 消费者{consumer_id} 收到空样本,队列可能已关闭") + break + + except asyncio.TimeoutError: + print(f"⏰ 消费者{consumer_id} 超时,检查队列状态...") + stats = await client.get_statistics() + if stats["queue_size"] == 0: + print(f"📭 队列为空,消费者{consumer_id} 等待...") + await asyncio.sleep(0.5) + continue + + # 模拟处理时间 + await asyncio.sleep(random.uniform(0.02, 0.05)) + + elapsed = time.time() - start_time + print(f"🍽️ 消费者{consumer_id} 完成,消费了 {consumed} 个样本,耗时 {elapsed:.2f}s") + return consumed + + # 启动并发生产者和消费者 + num_producers = 3 + num_consumers = 2 + samples_per_producer = 20 + + # 创建生产者任务 + producer_tasks = [ + asyncio.create_task(producer(i, samples_per_producer), name=f"producer_{i}") for i in range(num_producers) + ] + + # 创建消费者任务 + total_expected_samples = num_producers * samples_per_producer + samples_per_consumer = total_expected_samples // num_consumers + + consumer_tasks = [ + asyncio.create_task( + consumer(i, 
samples_per_consumer + (5 if i == 0 else 0)), # 第一个消费者多处理一些 + name=f"consumer_{i}", + ) + for i in range(num_consumers) + ] + + # 等待所有任务完成 + start_time = time.time() + + producer_results = await asyncio.gather(*producer_tasks, return_exceptions=True) + consumer_results = await asyncio.gather(*consumer_tasks, return_exceptions=True) + + end_time = time.time() + + # 统计结果 + total_produced = sum(r for r in producer_results if isinstance(r, int)) + total_consumed = sum(r for r in consumer_results if isinstance(r, int)) + + print("\n📈 并发测试结果:") + print(f" 总生产样本: {total_produced}") + print(f" 总消费样本: {total_consumed}") + print(f" 总耗时: {end_time - start_time:.2f}s") + print(f" 生产效率: {total_produced / (end_time - start_time):.2f} samples/s") + print(f" 消费效率: {total_consumed / (end_time - start_time):.2f} samples/s") + + # 最终统计 + final_stats = await client.get_statistics() + print(f"📊 最终队列统计: {final_stats}") + + # 清理 + await client.shutdown() + + return total_produced, total_consumed + + async def compare_resource_configurations(self): + """对比不同资源配置的效果""" + print("\n⚡ 对比不同资源配置的效果") + print("=" * 50) + + # 测试配置列表 + configs = [ + {"name": "默认配置", "num_cpus": None, "max_concurrency": None, "decorator": ray.remote}, + { + "name": "高CPU低并发", + "num_cpus": 4, + "max_concurrency": 5, + "decorator": lambda: ray.remote(num_cpus=4, max_concurrency=5), + }, + { + "name": "低CPU高并发", + "num_cpus": 1, + "max_concurrency": 20, + "decorator": lambda: ray.remote(num_cpus=1, max_concurrency=20), + }, + { + "name": "平衡配置", + "num_cpus": 2, + "max_concurrency": 10, + "decorator": lambda: ray.remote(num_cpus=2, max_concurrency=10), + }, + ] + + results = {} + + for config in configs: + print(f"\n🧪 测试配置: {config['name']}") + print(f" num_cpus: {config['num_cpus']}") + print(f" max_concurrency: {config['max_concurrency']}") + + # 动态创建MessageQueue类 + if config["num_cpus"] is None: + QueueClass = MessageQueue + else: + QueueClass = config["decorator"]()(MessageQueue) + + # 创建queue实例 + queue_actor = 
QueueClass.remote(self.config, max_queue_size=100) + client = MessageQueueClient(queue_actor) + + # 执行性能测试 + start_time = time.time() + + # 并发放入大量样本 + sample_count = 50 + put_tasks = [] + + for i in range(sample_count): + sample = QueueSample( + data={ + "task_id": f"perf_test_{i}", + "config": config["name"], + "data_size": random.randint(100, 1000), + }, + rollout_metadata={"config_test": True}, + ) + + task = asyncio.create_task(client.put_sample(sample, param_version=1)) + put_tasks.append(task) + + # 模拟流式到达 + if i % 10 == 0: + await asyncio.sleep(0.01) + + # 等待所有put完成 + put_results = await asyncio.gather(*put_tasks) + put_time = time.time() - start_time + + # 获取所有样本 + get_start_time = time.time() + all_samples = [] + + while True: + samples_batch, queue_size = await client.get_samples(min_batch_count=1) + if not samples_batch: + break + all_samples.extend(samples_batch) + + if queue_size == 0: + break + + get_time = time.time() - get_start_time + total_time = time.time() - start_time + + successful_puts = sum(put_results) + + # 记录结果 + results[config["name"]] = { + "successful_puts": successful_puts, + "retrieved_samples": len(all_samples), + "put_time": put_time, + "get_time": get_time, + "total_time": total_time, + "put_throughput": successful_puts / put_time if put_time > 0 else 0, + "get_throughput": len(all_samples) / get_time if get_time > 0 else 0, + "total_throughput": (successful_puts + len(all_samples)) / total_time if total_time > 0 else 0, + } + + print(f" ✅ 放入: {successful_puts}/{sample_count}") + print(f" 📦 获取: {len(all_samples)}") + print(f" ⏱️ 放入耗时: {put_time:.3f}s") + print(f" ⏱️ 获取耗时: {get_time:.3f}s") + print(f" 🚀 放入吞吐量: {successful_puts / put_time:.2f} ops/s") + + # 清理 + await client.shutdown() + + # 间隔 + await asyncio.sleep(1) + + # 生成对比报告 + print("\n📊 资源配置对比报告") + print("=" * 80) + print(f"{'配置名称':<15} {'放入吞吐量':<12} {'获取吞吐量':<12} {'总吞吐量':<12} {'总耗时':<10}") + print("-" * 80) + + best_config = "" + best_throughput = 0 + + for config_name, 
result in results.items(): + put_throughput = result["put_throughput"] + get_throughput = result["get_throughput"] + total_throughput = result["total_throughput"] + total_time = result["total_time"] + + print( + f"{config_name:<15} {put_throughput:<12.2f} {get_throughput:<12.2f} " + f"{total_throughput:<12.2f} {total_time:<10.3f}s" + ) + + if total_throughput > best_throughput: + best_throughput = total_throughput + best_config = config_name + + print(f"\n🏆 最佳配置: {best_config} (总吞吐量: {best_throughput:.2f} ops/s)") + + return results + + +async def main(): + """主函数""" + # 初始化Ray + if not ray.is_initialized(): + ray.init( + num_cpus=8, + object_store_memory=1000000000, # 1GB + ignore_reinit_error=True, + ) + + print("🎯 异步MessageQueue测试") + print(f"Ray集群资源: {ray.cluster_resources()}") + + tester = AsyncMessageQueueTester() + + try: + # 基本异步操作测试 + await tester.test_basic_async_operations() + + # 并发生产者消费者测试 + await tester.test_concurrent_producers_consumers() + + # 资源配置对比测试 + await tester.compare_resource_configurations() + + print("\n✅ 所有测试完成!") + + # 总结 + print("\n📋 总结:") + print("1. 使用 asyncio 后的优势:") + print(" - 真正的异步等待,不阻塞事件循环") + print(" - 更好的并发性能") + print(" - 与Ray的异步接口完美集成") + + print("\n2. 
资源配置建议:") + print(" - num_cpus: 控制CPU资源分配,影响计算密集型任务") + print(" - max_concurrency: 控制并发数,影响I/O密集型任务") + print(" - 对于MessageQueue: 推荐 num_cpus=2, max_concurrency=20") + + except Exception as e: + print(f"❌ 测试失败: {e}") + import traceback + + traceback.print_exc() + + finally: + ray.shutdown() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py index 34f4c78833c..b658526b7d7 100644 --- a/verl/experimental/agent_loop/agent_loop.py +++ b/verl/experimental/agent_loop/agent_loop.py @@ -470,18 +470,6 @@ async def get_trajectory_info(step, index, validate): return trajectory_info -async def _ray_future_to_asyncio(ray_future): - """将Ray future转换为asyncio可等待的对象""" - while True: - try: - # 非阻塞检查Ray future是否完成 - result = ray.get(ray_future, timeout=0.001) # 1ms timeout - return result - except ray.exceptions.GetTimeoutError: - # 未完成,让出控制权给其他协程 - await asyncio.sleep(1) # 1s sleep - - class AgentLoopManager: """Agent loop manager that manages a group of agent loop workers.""" @@ -551,10 +539,17 @@ def _initialize_llm_servers(self): def _init_agent_loop_workers(self): self.agent_loop_workers = [] - for i in range(self.config.actor_rollout_ref.rollout.agent.num_workers): + # 获取建议的资源配置 + agent_config = self.config.actor_rollout_ref.rollout.agent + max_concurrency = agent_config.get("max_concurrency", 10) + num_cpus = agent_config.get("num_cpus", 2) # 默认2个CPU核心 + + for i in range(agent_config.num_workers): self.agent_loop_workers.append( AgentLoopWorker.options( name=f"agent_loop_worker_{i}", + max_concurrency=max_concurrency, # 设置最大并发数 + num_cpus=num_cpus, # 设置CPU资源需求 ).remote(self.config, self.async_llm_servers) ) @@ -603,9 +598,9 @@ async def generate_single_sample_async(self, sample: DataProto, sample_id: str) # 使用负载均衡选择 worker worker = self._select_best_worker() - # 异步处理单个样本 - output_future = worker.generate_sequences.remote(sample) - outputs = await 
_ray_future_to_asyncio(output_future) + # 异步处理单个样本 - 使用无后处理版本获取原始AgentLoopOutput + output_future = worker.generate_sequences_no_post.remote(sample) + outputs = await asyncio.wrap_future(output_future.future()) processing_time = time.time() - start_time From a59b84f5b2a74690f9aba964da7b98d373b563d3 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Thu, 14 Aug 2025 11:38:35 +0800 Subject: [PATCH 044/182] fix ray train bug --- verl/trainer/ppo/ray_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py index 26150cc631d..d16c2736bcc 100644 --- a/verl/trainer/ppo/ray_trainer.py +++ b/verl/trainer/ppo/ray_trainer.py @@ -913,7 +913,7 @@ def _init_models(self): self.rm_wg.init_model() # we should create rollout at the end so that vllm can have a better estimation of kv cache memory - self.actor_rollout_wg = self.all_wg[Role.ActorRollout] + self.actor_rollout_wg = self.all_wg[str(Role.ActorRollout)] self.actor_rollout_wg.init_model() def _init_async_rollout_manager(self): From 191605b5edda82a82ad260ad429c518ed41ffe9d Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Thu, 14 Aug 2025 20:48:48 +0800 Subject: [PATCH 045/182] async server --- recipe/fully_async_policy/fully_async_main.py | 31 +++---- .../fully_async_rollouter.py | 50 +++++------ .../fully_async_policy/fully_async_trainer.py | 48 +++++------ recipe/fully_async_policy/utils.py | 19 +++++ recipe/one_step_off_policy/fsdp_workers.py | 46 ++++++---- recipe/one_step_off_policy/main_ppo.py | 33 +++----- .../one_step_off_policy/megatron_workers.py | 83 +++++++++---------- ...harding_manager.py => sharding_manager.py} | 8 +- verl/experimental/agent_loop/agent_loop.py | 2 +- verl/trainer/ppo/ray_trainer.py | 8 +- .../rollout/vllm_rollout/vllm_async_server.py | 19 ++++- .../rollout/vllm_rollout/vllm_rollout_spmd.py | 27 +++--- 12 files changed, 201 insertions(+), 173 deletions(-) create mode 100644 recipe/fully_async_policy/utils.py rename 
recipe/one_step_off_policy/{vllm_sharding_manager.py => sharding_manager.py} (94%) diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py index 179929f242a..0e43bd6151b 100644 --- a/recipe/fully_async_policy/fully_async_main.py +++ b/recipe/fully_async_policy/fully_async_main.py @@ -82,39 +82,29 @@ def create_role_worker_mapping(config): if config.actor_rollout_ref.actor.strategy == "fsdp2": assert config.actor_rollout_ref.actor.strategy == config.critic.strategy from recipe.one_step_off_policy.fsdp_workers import ( - ActorRolloutRefWorker, - AsyncActorRolloutRefWorker, + DetachActorWorker, + DetachAsyncRolloutWorker, CriticWorker, - RolloutWorker, ) from verl.single_controller.ray import RayWorkerGroup - - actor_rollout_cls = ( - AsyncActorRolloutRefWorker if config.actor_rollout_ref.rollout.mode == "async" else ActorRolloutRefWorker - ) ray_worker_group_cls = RayWorkerGroup elif config.actor_rollout_ref.actor.strategy == "megatron": assert config.actor_rollout_ref.actor.strategy == config.critic.strategy from recipe.one_step_off_policy.megatron_workers import ( - ActorRolloutRefWorker, - AsyncActorRolloutRefWorker, + DetachActorWorker, + DetachAsyncRolloutWorker, CriticWorker, - RolloutWorker, ) from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup - - actor_rollout_cls = ( - AsyncActorRolloutRefWorker if config.actor_rollout_ref.rollout.mode == "async" else ActorRolloutRefWorker - ) ray_worker_group_cls = NVMegatronRayWorkerGroup else: raise NotImplementedError(f"Unsupported strategy: {config.actor_rollout_ref.actor.strategy}") role_worker_mapping = { - Role.Actor: ray.remote(actor_rollout_cls), - Role.Rollout: ray.remote(RolloutWorker), + Role.Actor: ray.remote(DetachActorWorker), + Role.Rollout: ray.remote(DetachAsyncRolloutWorker), Role.Critic: ray.remote(CriticWorker), } @@ -187,11 +177,10 @@ def _initialize_components(self, config) -> None: self.components["reward_fn"] = reward_fn 
self.components["val_reward_fn"] = val_reward_fn - self.max_queue_size = ( - (config.async_training.staleness_threshold + 1) - * config.data.train_batch_size - * config.actor_rollout_ref.rollout.n - ) * 10 # x 10 avoid deadlock + self.max_queue_size = ((config.async_training.staleness_threshold + 1) + * config.data.train_batch_size + * config.actor_rollout_ref.rollout.n + ) * 10 # x 10 avoid deadlock print("[ASYNC MAIN] Creating MessageQueue...") message_queue = MessageQueue.remote(config, self.max_queue_size) message_queue_client = MessageQueueClient(message_queue) diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 0060cfa1b02..5f6f2b0d589 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -19,6 +19,7 @@ from omegaconf import OmegaConf from recipe.fully_async_policy.message_queue import MessageQueueClient, QueueSample +from recipe.fully_async_policy.utils import calculate_one_step_size from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup from verl.trainer.ppo.ray_trainer import RayPPOTrainer, ResourcePoolManager, Role, WorkerType from verl.utils.tracking import ValidationGenerationsLogger @@ -33,17 +34,17 @@ class FullyAsyncRollouter(RayPPOTrainer): """ def __init__( - self, - config, - tokenizer, - role_worker_mapping: dict[Role, WorkerType], - resource_pool_manager: ResourcePoolManager, - ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, - processor=None, - reward_fn=None, - val_reward_fn=None, - device_name=None, - max_queue_size=1000, + self, + config, + tokenizer, + role_worker_mapping: dict[Role, WorkerType], + resource_pool_manager: ResourcePoolManager, + ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, + processor=None, + reward_fn=None, + val_reward_fn=None, + device_name=None, + max_queue_size=1000, ): # Store the tokenizer for text processing self.tokenizer = tokenizer @@ -53,7 
+54,11 @@ def __init__( self.val_reward_fn = val_reward_fn self.hybrid_engine = config.actor_rollout_ref.hybrid_engine + assert not self.hybrid_engine + assert self.config.data.train_batch_size == 0, "train_batch_size must be zero" + assert self.config.data.gen_batch_size == 1, "gen_batch_size must be one" + self.role_worker_mapping = role_worker_mapping self.resource_pool_manager = resource_pool_manager @@ -81,16 +86,11 @@ def __init__( self._validate_config() print(f"[FullyAsyncRollouter] Rollouter _create_dataloader...\n{train_dataset}\n{val_dataset}") - assert self.config.data.gen_batch_size == 1, "gen_batch_size must be one" - self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler) - total_rollout_steps = len(self.train_dataloader) * self.config.trainer.total_epochs - + self.total_rollout_steps = len(self.train_dataloader) * self.config.trainer.total_epochs if self.config.rollout.total_rollout_steps is not None: - total_rollout_steps = self.config.rollout.total_rollout_steps - - self.total_rollout_steps = total_rollout_steps + self.total_rollout_steps = min(self.config.rollout.total_rollout_steps, self.total_rollout_steps) print(f"[FullyAsyncRollouter] Total rollout steps: {self.total_rollout_steps}") # Rollouter parameter configuration @@ -107,12 +107,6 @@ def __init__( self.train_step_samples = 0 self.dropped_stale_samples = 0 - # Calculate the samples needed for a train, used to calculate staleness and interrupt rollout - n_responses_per_prompt = self.config.actor_rollout_ref.rollout.n - batch_size = self.config.data.train_batch_size - required_samples = n_responses_per_prompt * batch_size - self.max_required_samples = required_samples * (self.staleness_threshold + 1) - # Worker groups self.rollout_wg = None self.message_queue_client = None @@ -145,6 +139,12 @@ def __init__( self.active_sample_count = 0 # 当前正在处理的样本数 self.queue_full_pause_count = 0 # 队列满导致的暂停次数 + # Calculate the samples needed for a train, used to calculate staleness 
and interrupt rollout + self.required_samples = calculate_one_step_size(self.minimal_bsz, + config.actor_rollout_ref.actor.ppo_mini_batch_size) + self.max_required_samples = self.required_samples * (self.staleness_threshold + 1) + + async def set_message_queue_client(self, message_queue_client: MessageQueueClient): """Set message queue client""" async with self.lock: @@ -173,6 +173,8 @@ def _validate_config(self): if not hasattr(self.config, "async_training"): raise ValueError("[FullyAsyncRollouter] Missing async_training configuration") + super()._validate_config() + def _create_actor_rollout_classes(self): # only create rollout for role in [Role.Rollout]: diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index d9883aaf33f..21b2eda259e 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -22,6 +22,8 @@ from omegaconf import OmegaConf from recipe.fully_async_policy.message_queue import MessageQueueClient, QueueSample +from recipe.fully_async_policy.utils import calculate_one_step_size +from verl.experimental.agent_loop.agent_loop import postprocess_agent_loop_outputs from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup from verl.trainer.ppo import core_algos from verl.trainer.ppo.core_algos import AdvantageEstimator @@ -44,16 +46,16 @@ class FullyAsyncTrainer(RayPPOTrainer): """ def __init__( - self, - config, - tokenizer, - role_worker_mapping: dict[Role, WorkerType], - resource_pool_manager: ResourcePoolManager, - ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, - processor=None, - reward_fn=None, - val_reward_fn=None, - device_name=None, + self, + config, + tokenizer, + role_worker_mapping: dict[Role, WorkerType], + resource_pool_manager: ResourcePoolManager, + ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, + processor=None, + reward_fn=None, + val_reward_fn=None, + device_name=None, ): # Store the 
tokenizer for text processing self.tokenizer = tokenizer @@ -102,6 +104,9 @@ def __init__( self.stale_samples_processed = 0 self.current_param_version = 0 + self.required_samples = calculate_one_step_size(self.minimal_bsz, + config.actor_rollout_ref.actor.ppo_mini_batch_size) + def set_message_queue_client(self, message_queue_client: MessageQueueClient): """Set message queue client""" self.message_queue_client = message_queue_client @@ -122,14 +127,9 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]: Returns: tuple: (epoch, batch_dict, gen_batch_output) """ - # Calculate the number of samples needed - n_responses_per_prompt = self.config.actor_rollout_ref.rollout.n - batch_size = self.config.data.train_batch_size - required_samples = n_responses_per_prompt * batch_size - print( "[FullyAsyncTrainer] " - f"Requesting {required_samples} samples from queue (n={n_responses_per_prompt}, batch_size={batch_size})", + f"Requesting {self.required_samples} samples from queue", flush=True, ) @@ -137,9 +137,7 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]: consumer_start = time.time() queue_samples = [] - print(f"[FullyAsyncTrainer] Starting sample collection loop, required={required_samples}") - - while len(queue_samples) < required_samples: + while len(queue_samples) < self.required_samples: # 获取单个样本,会一直等待直到有样本或收到None sample = self.message_queue_client.get_sample() @@ -147,23 +145,23 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]: # 检测到结束信号(None),立即退出 logger.info( f"Detected termination signal (None), stopping sample collection. 
" - f"Collected {len(queue_samples)}/{required_samples} samples" + f"Collected {len(queue_samples)}/{self.required_samples} samples" ) break queue_samples.append(sample) - if len(queue_samples) % 10 == 0 or len(queue_samples) >= required_samples: - print(f"[FullyAsyncTrainer] Collected {len(queue_samples)}/{required_samples} samples") + if len(queue_samples) % 10 == 0 or len(queue_samples) >= self.required_samples: + print(f"[FullyAsyncTrainer] Collected {len(queue_samples)}/{self.required_samples} samples") consumer_end = time.time() - if not queue_samples or len(queue_samples) < required_samples: + if not queue_samples or len(queue_samples) < self.required_samples: logger.warning("not enough samples collected after loop") return None, None print( - f"[FullyAsyncTrainer] Loop collection completed: {len(queue_samples)}/{required_samples} samples, " + f"[FullyAsyncTrainer] Loop collection completed: {len(queue_samples)}/{self.required_samples} samples, " f"total wait time: {consumer_end - consumer_start:.2f} seconds" ) @@ -206,7 +204,7 @@ def _assemble_gen_batch_output_from_queue_samples(self, queue_samples: list[Queu # Use the static method to postprocess AgentLoopOutput list into DataProto from verl.experimental.agent_loop.agent_loop import AgentLoopWorker - batch = AgentLoopWorker.postprocess_agent_loop_outputs(agent_loop_outputs, self.tokenizer, self.config) + batch = postprocess_agent_loop_outputs(agent_loop_outputs, self.tokenizer, self.config) # Apply _post_generate_batch logic here batch = self._post_generate_batch_for_agent_outputs(batch, agent_loop_outputs) diff --git a/recipe/fully_async_policy/utils.py b/recipe/fully_async_policy/utils.py new file mode 100644 index 00000000000..71ae7c7d16d --- /dev/null +++ b/recipe/fully_async_policy/utils.py @@ -0,0 +1,19 @@ +# Copyright 2025 Meituan Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Calculate the number of samples needed + +def calculate_one_step_size(minimal_bsz, ppo_mini_batch_size): + return minimal_bsz * ppo_mini_batch_size diff --git a/recipe/one_step_off_policy/fsdp_workers.py b/recipe/one_step_off_policy/fsdp_workers.py index 0aa21991708..e6ab9d1c241 100644 --- a/recipe/one_step_off_policy/fsdp_workers.py +++ b/recipe/one_step_off_policy/fsdp_workers.py @@ -39,7 +39,7 @@ from verl.utils.import_utils import import_external_libs from verl.utils.model import get_generation_config, update_model_config from verl.utils.vllm_utils import patch_vllm_moe_model_weight_loader -from verl.workers.fsdp_workers import ActorRolloutRefWorker as ARRWorker +from verl.workers.fsdp_workers import ActorRolloutRefWorker, AsyncActorRolloutRefWorker from verl.workers.fsdp_workers import CriticWorker logger = logging.getLogger(__file__) @@ -47,19 +47,13 @@ device_name = get_device_name() -__all__ = ["ActorRolloutRefWorker", "AsyncActorRolloutRefWorker", "CriticWorker", "RolloutWorker"] +__all__ = ["DetachActorWorker", "DetachRolloutWorker", "DetachAsyncRolloutWorker", "CriticWorker"] -class ActorRolloutRefWorker(ARRWorker): - def _get_actor_params(self): - assert self._is_actor - params = self.actor_module_fsdp.state_dict() - from verl.utils.model import convert_weight_keys +class DetachNcclSync(ActorRolloutRefWorker): - params = convert_weight_keys( - params, getattr(self.actor_module_fsdp, "_fsdp_wrapped_module", self.actor_module_fsdp) - ) - return params + def _get_actor_params(self): + pass @register(dispatch_mode=Dispatch.ONE_TO_ALL, 
blocking=False) def sync_rollout_weights(self): @@ -108,7 +102,19 @@ def get_actor_weights_info(self): return ret -class RolloutWorker(ActorRolloutRefWorker): +class DetachActorWorker(DetachNcclSync): + def _get_actor_params(self): + assert self._is_actor + params = self.actor_module_fsdp.state_dict() + from verl.utils.model import convert_weight_keys + + params = convert_weight_keys( + params, getattr(self.actor_module_fsdp, "_fsdp_wrapped_module", self.actor_module_fsdp) + ) + return params + + +class DetachRolloutWorker(DetachNcclSync): def __init__(self, config: DictConfig, role: str): Worker.__init__(self) assert role == "rollout" @@ -202,9 +208,9 @@ def init_model(self): trust_remote_code=trust_remote_code, ) log_gpu_memory_usage(f"After building {rollout_name} rollout", logger=logger) - from .vllm_sharding_manager import VLLMShardingManager - rollout_sharding_manager = VLLMShardingManager( + from sharding_manager import DetachShardingManager + rollout_sharding_manager = DetachShardingManager( inference_engine=rollout.inference_engine, device_mesh=rollout_device_mesh ) @@ -223,6 +229,12 @@ def set_actor_weights_info(self, weights_info): self._weights_info = weights_info -class AsyncActorRolloutRefWorker(ActorRolloutRefWorker): - def __init__(self, *args, **kwargs): - raise NotImplementedError +class DetachAsyncRolloutWorker(AsyncActorRolloutRefWorker, DetachRolloutWorker): + def __init__(self, config: DictConfig, role: str): + print(f"[DetachAsyncRolloutWorker] {DetachAsyncRolloutWorker.__mro__}") + DetachRolloutWorker.__init__(self, config, role) + + @register(dispatch_mode=Dispatch.ONE_TO_ALL) + def init_model(self): + print(f"[DetachAsyncRolloutWorker] init_model") + DetachRolloutWorker.init_model(self) diff --git a/recipe/one_step_off_policy/main_ppo.py b/recipe/one_step_off_policy/main_ppo.py index 0a037df17fa..d9d8f0bb849 100644 --- a/recipe/one_step_off_policy/main_ppo.py +++ b/recipe/one_step_off_policy/main_ppo.py @@ -62,17 +62,11 @@ def run(self, 
config): assert config.actor_rollout_ref.actor.strategy == config.critic.strategy from verl.single_controller.ray import RayWorkerGroup - from .fsdp_workers import ( - ActorRolloutRefWorker, - AsyncActorRolloutRefWorker, + from recipe.one_step_off_policy.fsdp_workers import ( + DetachActorWorker, + DetachRolloutWorker, + DetachAsyncRolloutWorker, CriticWorker, - RolloutWorker, - ) - - actor_rollout_cls = ( - AsyncActorRolloutRefWorker - if config.actor_rollout_ref.rollout.mode == "async" - else ActorRolloutRefWorker ) ray_worker_group_cls = RayWorkerGroup @@ -80,17 +74,11 @@ def run(self, config): assert config.actor_rollout_ref.actor.strategy == config.critic.strategy from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup - from .megatron_workers import ( - ActorRolloutRefWorker, - AsyncActorRolloutRefWorker, + from recipe.one_step_off_policy.megatron_workers import ( + DetachActorWorker, + DetachRolloutWorker, + DetachAsyncRolloutWorker, CriticWorker, - RolloutWorker, - ) - - actor_rollout_cls = ( - AsyncActorRolloutRefWorker - if config.actor_rollout_ref.rollout.mode == "async" - else ActorRolloutRefWorker ) ray_worker_group_cls = NVMegatronRayWorkerGroup @@ -100,8 +88,9 @@ def run(self, config): from .ray_trainer import ResourcePoolManager, Role role_worker_mapping = { - Role.Actor: ray.remote(actor_rollout_cls), - Role.Rollout: ray.remote(RolloutWorker), + Role.Actor: ray.remote(DetachActorWorker), + Role.Rollout: ray.remote( + DetachAsyncRolloutWorker if config.actor_rollout_ref.rollout.mode == "async" else DetachRolloutWorker), Role.Critic: ray.remote(CriticWorker), } diff --git a/recipe/one_step_off_policy/megatron_workers.py b/recipe/one_step_off_policy/megatron_workers.py index f7b58405b4f..9011f5a6023 100644 --- a/recipe/one_step_off_policy/megatron_workers.py +++ b/recipe/one_step_off_policy/megatron_workers.py @@ -27,42 +27,18 @@ from verl.utils.device import get_device_name, get_torch_device from verl.utils.fs import copy_to_local 
from verl.utils.vllm_utils import patch_vllm_moe_model_weight_loader -from verl.workers.megatron_workers import ActorRolloutRefWorker as ARRWorker +from verl.workers.megatron_workers import ActorRolloutRefWorker, AsyncActorRolloutRefWorker from verl.workers.megatron_workers import CriticWorker, RewardModelWorker logger = logging.getLogger(__file__) logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) -__all__ = ["ActorRolloutRefWorker", "AsyncActorRolloutRefWorker", "CriticWorker", "RewardModelWorker", "RolloutWorker"] +__all__ = ["DetachActorWorker", "DetachRolloutWorker", "DetachAsyncRolloutWorker", "CriticWorker"] -class ActorRolloutRefWorker(ARRWorker): - def __init__(self, config: DictConfig, role: str): - assert role in ["actor", "ref"] - tmp_role = "ref" if role == "ref" else "actor_rollout" - super().__init__(config, tmp_role) - if role == "actor": - self._is_rollout = False - self.role = role - +class DetachNcclSync(ActorRolloutRefWorker): def _get_actor_params_generator(self): - assert self._is_actor - from verl.models.mcore import get_mcore_weight_converter - from verl.utils.megatron_utils import per_tensor_generator - - layer_name_mapping = { - "qkv_layer_name": "self_attention.linear_qkv.", - "gate_proj_layer_name": "linear_fc1.", - } - weight_converter = get_mcore_weight_converter(self.actor_model_config, self.dtype) - generator = per_tensor_generator( - self.actor.actor_module, - self.actor_model_config, - weight_converter, - self.tf_config, - layer_name_mapping, - ) - return generator + pass @register(dispatch_mode=Dispatch.ONE_TO_ALL, blocking=False) def sync_rollout_weights(self): @@ -106,11 +82,28 @@ def get_actor_weights_info(self): return ret -class RolloutWorker(ActorRolloutRefWorker): - def __init__(self, config: DictConfig, role: str): - assert role == "rollout" - ARRWorker.__init__(self, config, role) +class DetachActorWorker(DetachNcclSync): + def _get_actor_params_generator(self): + assert self._is_actor + from verl.models.mcore 
import get_mcore_weight_converter + from verl.utils.megatron_utils import per_tensor_generator + layer_name_mapping = { + "qkv_layer_name": "self_attention.linear_qkv.", + "gate_proj_layer_name": "linear_fc1.", + } + weight_converter = get_mcore_weight_converter(self.actor_model_config, self.dtype) + generator = per_tensor_generator( + self.actor.actor_module, + self.actor_model_config, + weight_converter, + self.tf_config, + layer_name_mapping, + ) + return generator + + +class DetachRolloutWorker(DetachNcclSync): @register(dispatch_mode=Dispatch.ONE_TO_ALL) def init_model(self): if self.config.model.get("external_lib", None) is not None: @@ -142,12 +135,9 @@ def init_model(self): from torch.distributed.device_mesh import init_device_mesh assert self.config.rollout.name == "vllm" - assert self.config.rollout.mode == "sync" from verl.workers.rollout.vllm_rollout import vLLMRollout - from .vllm_sharding_manager import VLLMShardingManager - # NOTE(sgm): If the QKV and gate_up projection layer are concate together in actor, # we will reorganize their weight format when resharding from actor to rollout. 
@@ -175,14 +165,16 @@ def init_model(self): ) log_gpu_memory_usage("After building vllm rollout", logger=logger) - sharding_manager = VLLMShardingManager( - inference_engine=rollout.inference_engine, - device_mesh=rollout_device_mesh, + from sharding_manager import DetachShardingManager + rollout_sharding_manager = DetachShardingManager( + inference_engine=rollout.inference_engine, device_mesh=rollout_device_mesh ) + log_gpu_memory_usage("After building sharding manager", logger=logger) - self.rollout, self.sharding_manager = rollout, sharding_manager - self.rollout.sharding_manager = sharding_manager + self.rollout = rollout + self.sharding_manager = rollout_sharding_manager + self.rollout.sharding_manager = rollout_sharding_manager @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO, blocking=False) def async_generate_sequences(self, *args, **kwargs): @@ -194,6 +186,11 @@ def set_actor_weights_info(self, weights_info): self._weights_info = weights_info -class AsyncActorRolloutRefWorker(ActorRolloutRefWorker): - def __init__(self, *args, **kwargs): - raise NotImplementedError +class DetachAsyncRolloutWorker(AsyncActorRolloutRefWorker, DetachRolloutWorker): + def __init__(self, config: DictConfig, role: str): + print(DetachAsyncRolloutWorker.__mro__) + DetachRolloutWorker.__init__(self, config, role) + + @register(dispatch_mode=Dispatch.ONE_TO_ALL) + def init_model(self): + DetachRolloutWorker.init_model(self) \ No newline at end of file diff --git a/recipe/one_step_off_policy/vllm_sharding_manager.py b/recipe/one_step_off_policy/sharding_manager.py similarity index 94% rename from recipe/one_step_off_policy/vllm_sharding_manager.py rename to recipe/one_step_off_policy/sharding_manager.py index c33ba585470..bc3dae69031 100644 --- a/recipe/one_step_off_policy/vllm_sharding_manager.py +++ b/recipe/one_step_off_policy/sharding_manager.py @@ -30,14 +30,14 @@ logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) -class VLLMShardingManager(BaseShardingManager): +class 
DetachShardingManager(BaseShardingManager): @check_device_is_available() def __init__(self, inference_engine, device_mesh: DeviceMesh): self.device_mesh = device_mesh self.inference_engine = inference_engine - inference_engine.wake_up() - assert device_mesh is not None - assert inference_engine is not None + # inference_engine.wake_up() + # assert device_mesh is not None + # assert inference_engine is not None self.tp_size = self.device_mesh["infer_tp"].size() self.tp_rank = self.device_mesh["infer_tp"].get_local_rank() self.timing = {} diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py index b658526b7d7..29f2b30edb7 100644 --- a/verl/experimental/agent_loop/agent_loop.py +++ b/verl/experimental/agent_loop/agent_loop.py @@ -478,7 +478,7 @@ def __init__(self, config: DictConfig, worker_group: RayWorkerGroup): Args: config (DictConfig): trainer config. - worker_group (RayWorkerGroup): ActorRolloutRef worker group. + worker_group (RayWorkerGroup): AsyncActorRolloutRefWorker worker group. """ self.config = config self.worker_group = worker_group diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py index d16c2736bcc..60621021b30 100644 --- a/verl/trainer/ppo/ray_trainer.py +++ b/verl/trainer/ppo/ray_trainer.py @@ -438,15 +438,15 @@ def _validate_config(self): megatron_dp = n_gpus // ( model_parallel_size * config.actor_rollout_ref.actor.megatron.context_parallel_size ) - minimal_bsz = megatron_dp * config.actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu + self.minimal_bsz = megatron_dp * config.actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu else: - minimal_bsz = n_gpus + self.minimal_bsz = n_gpus # 1. 
Check total batch size for data correctness real_train_batch_size = config.data.train_batch_size * config.actor_rollout_ref.rollout.n - assert real_train_batch_size % minimal_bsz == 0, ( + assert real_train_batch_size % self.minimal_bsz == 0, ( f"real_train_batch_size ({real_train_batch_size}) must be divisible by minimal possible batch size " - f"({minimal_bsz})" + f"({self.minimal_bsz})" ) # A helper function to check "micro_batch_size" vs "micro_batch_size_per_gpu" diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py index 988dac407d7..8c0d608871f 100644 --- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py +++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py @@ -57,6 +57,12 @@ def _get_model_runner_workers(vllm_config, init_ray: bool = True): actor_name for actor_name in ray.util.list_named_actors() if actor_name.startswith(f"{wg_prefix}WorkerDict") ] + print(f"namespace: {namespace}") + print(f"wg_prefix: {wg_prefix}") + print(f"vllm_dp_size: {vllm_dp_size}") + print(f"vllm_dp_rank: {vllm_dp_rank}") + print(f"actor_names: {actor_names}") + vllm_tp_size = vllm_config.parallel_config.tensor_parallel_size assert len(actor_names) == vllm_dp_size * vllm_tp_size, ( f"instance_id: {vllm_config.instance_id} has {len(actor_names)} actors, but vllm_dp_size: " @@ -84,6 +90,7 @@ class ExternalRayDistributedExecutor(Executor): uses_ray: bool = False def _init_executor(self) -> None: + print("[ExternalRayDistributedExecutor] Initializing ray actors...") self.workers = _get_model_runner_workers(vllm_config=self.vllm_config, init_ray=True) kwargs = dict( @@ -93,10 +100,11 @@ def _init_executor(self) -> None: distributed_init_method="env://", is_driver_worker=True, ) + print(f"ray start instance_id: {self.vllm_config.instance_id} initializes") self.collective_rpc("init_worker", args=([kwargs],)) self.collective_rpc("init_device") self.collective_rpc("load_model") - print(f"instance_id: 
{self.vllm_config.instance_id} initializes finished.") + print(f"ray instance_id: {self.vllm_config.instance_id} initializes finished.") def collective_rpc( self, @@ -128,6 +136,7 @@ class ExternalZeroMQDistributedExecutor(Executor): uses_ray: bool = False def _init_executor(self) -> None: + print(f"[ExternalZeroMQDistributedExecutor] Initializing ray actors...") addresses = os.environ["VERL_VLLM_ZMQ_ADDRESSES"].split(",") self.context = zmq.Context() self.sockets = [] @@ -143,9 +152,11 @@ def _init_executor(self) -> None: distributed_init_method="env://", is_driver_worker=True, ) + print(f"ZeroMQ start instance_id: {self.vllm_config.instance_id} initializes") self.collective_rpc("init_worker", args=([kwargs],)) self.collective_rpc("init_device") self.collective_rpc("load_model") + print(f"ZeroMQ instance_id: {self.vllm_config.instance_id} initializes finished.") def collective_rpc( self, @@ -264,8 +275,12 @@ async def init_engine(self): # init async llm engine vllm_config = self._create_engine_config(engine_args) + + print(f"AsyncvLLMServer AsyncLLM.from_vllm_config {vllm_config}") self.engine = AsyncLLM.from_vllm_config(vllm_config) + print("AsyncvLLMServer build serving chat") + # build serving chat model_config = self.engine.model_config BASE_MODEL_PATHS = [BaseModelPath(name=model_name, model_path=model_path)] @@ -282,6 +297,8 @@ async def init_engine(self): tool_parser=config.multi_turn.format, # hermes, llama3_json, ... 
) + print("AsyncvLLMServer init_engine success") + def _create_engine_config(self, engine_args: AsyncEngineArgs): vllm_config = engine_args.create_engine_config() namespace = ray.get_runtime_context().namespace diff --git a/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py b/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py index 5bd571016ac..307e7e77036 100644 --- a/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py +++ b/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py @@ -58,6 +58,7 @@ logger = logging.getLogger(__file__) logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) + # TODO # 1. support pp in vllm # 2. passing tokenizer is not necessary? no encoding/decoding is happending here @@ -108,11 +109,11 @@ def __init__(self, model_path: str, config: DictConfig, tokenizer, model_hf_conf if hasattr(model_hf_config, "max_position_embeddings"): max_position_embeddings = model_hf_config.max_position_embeddings elif hasattr(model_hf_config, "llm_config") and hasattr( - model_hf_config.llm_config, "max_position_embeddings" + model_hf_config.llm_config, "max_position_embeddings" ): max_position_embeddings = model_hf_config.llm_config.max_position_embeddings elif hasattr(model_hf_config, "text_config") and hasattr( - model_hf_config.text_config, "max_position_embeddings" + model_hf_config.text_config, "max_position_embeddings" ): max_position_embeddings = model_hf_config.text_config.max_position_embeddings if max_position_embeddings is None: @@ -127,12 +128,12 @@ def __init__(self, model_path: str, config: DictConfig, tokenizer, model_hf_conf rope_scaling_factor = rope_scaling_config.get("factor", 1.0) assert ( - model_hf_config.max_position_embeddings * rope_scaling_factor - >= config.prompt_length + config.response_length + model_hf_config.max_position_embeddings * rope_scaling_factor + >= config.prompt_length + config.response_length ), ( - "model context length should be greater than total sequence length, " - + f"got 
rope_scaling_factor={rope_scaling_factor} and " - + f"max_position_embeddings={model_hf_config.max_position_embeddings}" + "model context length should be greater than total sequence length, " + + f"got rope_scaling_factor={rope_scaling_factor} and " + + f"max_position_embeddings={model_hf_config.max_position_embeddings}" ) max_model_len = int(config.max_model_len or config.prompt_length + config.response_length) @@ -267,7 +268,7 @@ def generate_sequences(self, prompts: DataProto, **kwargs) -> DataProto: if "multi_modal_data" in non_tensor_batch: vllm_inputs = [] for raw_prompt_ids, multi_modal_data in zip( - non_tensor_batch.pop("raw_prompt_ids"), non_tensor_batch.pop("multi_modal_data"), strict=True + non_tensor_batch.pop("raw_prompt_ids"), non_tensor_batch.pop("multi_modal_data"), strict=True ): vllm_inputs.append({"prompt_token_ids": raw_prompt_ids, "multi_modal_data": multi_modal_data}) else: @@ -389,9 +390,9 @@ def _monkey_patch_compute_logits(model, vocab_size: int): original_compute_logits = model.compute_logits def compute_logits( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, ) -> torch.Tensor: logits = original_compute_logits(hidden_states, sampling_metadata) logits[..., vocab_size:] = float("-inf") @@ -458,6 +459,8 @@ def get_zeromq_address(self): def init_worker(self, all_kwargs: list[dict[str, Any]]): """Initialize worker engine.""" + + print("[vLLMAsyncRollout] init_worker") all_kwargs[0]["rank"] = int(os.environ["RANK"]) all_kwargs[0]["local_rank"] = 0 @@ -468,6 +471,8 @@ def init_worker(self, all_kwargs: list[dict[str, Any]]): def load_model(self, *args, **kwargs): self.inference_engine.load_model(*args, **kwargs) + print(f"[vLLMAsyncRollout] load_model {args} {kwargs}") + # inference engine is initialized now, update sharding manager self.sharding_manager.inference_engine = self.inference_engine self.sharding_manager.model_runner = 
self.inference_engine.worker.model_runner From 6ddb460a9507bc74d1b4db2afe1d89c66f9c291c Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Thu, 14 Aug 2025 21:26:07 +0800 Subject: [PATCH 046/182] update shell --- tests/special_e2e/run_fully_async_policy.sh | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh index 27c033abc1d..5662d6cb479 100644 --- a/tests/special_e2e/run_fully_async_policy.sh +++ b/tests/special_e2e/run_fully_async_policy.sh @@ -13,6 +13,14 @@ MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B-Instruct} MODEL_PATH=${MODEL_PATH:-${HOME}/models/${MODEL_ID}} huggingface-cli download "${MODEL_ID}" --local-dir "${MODEL_PATH}" + +rollout_mode="async" +rollout_name="vllm" # sglang or vllm +if [ "$rollout_mode" = "async" ]; then + export VLLM_USE_V1=1 + return_raw_chat="True" +fi + # Algorithm parameters adv_estimator=grpo @@ -33,11 +41,13 @@ overlong_penalty_factor=1.0 # Training parameters loss_agg_mode="token-mean" -train_prompt_bsz=2 -gen_prompt_bsz=2 +train_prompt_bsz=0 +gen_prompt_bsz=1 n_resp_per_prompt=3 train_prompt_mini_bsz=1 +total_rollout_steps=10 + # Temperature parameters temperature=1.0 top_p=1.0 @@ -67,6 +77,7 @@ common_params=( data.max_response_length=${max_response_length} data.train_batch_size=${train_prompt_bsz} data.gen_batch_size=${gen_prompt_bsz} + data.return_raw_chat=${return_raw_chat} actor_rollout_ref.rollout.n=${n_resp_per_prompt} algorithm.adv_estimator=${adv_estimator} algorithm.use_kl_in_reward=${use_kl_in_reward} @@ -95,6 +106,8 @@ common_params=( actor_rollout_ref.rollout.val_kwargs.do_sample=True actor_rollout_ref.rollout.val_kwargs.n=1 actor_rollout_ref.rollout.enable_chunked_prefill=True + actor_rollout_ref.rollout.name=${rollout_name} + actor_rollout_ref.rollout.mode=${rollout_mode} reward_model.reward_manager=dapo +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} 
+reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} @@ -112,7 +125,7 @@ common_params=( trainer.n_gpus_per_node=${n_gpus_training} rollout.nnodes=1 rollout.n_gpus_per_node=${n_gpus_rollout} - rollout.total_rollout_steps=10 + rollout.total_rollout_steps=${total_rollout_steps} rollout.total_epochs=2 # Fully async specific configurations async_training.staleness_threshold=${staleness_threshold} From 12edb900e41c67b7a19ffd46e712a3bb095d1797 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Thu, 14 Aug 2025 23:41:58 +0800 Subject: [PATCH 047/182] stream rollout --- recipe/fully_async_policy/fully_async_main.py | 28 ++++++------ .../fully_async_rollouter.py | 36 +++++++-------- .../fully_async_policy/fully_async_trainer.py | 9 ++-- recipe/fully_async_policy/message_queue.py | 4 ++ ..._manager.py => detach_sharding_manager.py} | 10 +++-- recipe/one_step_off_policy/fsdp_workers.py | 45 ++++++++++++++++--- tests/special_e2e/run_fully_async_policy.sh | 2 +- 7 files changed, 88 insertions(+), 46 deletions(-) rename recipe/one_step_off_policy/{sharding_manager.py => detach_sharding_manager.py} (92%) diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py index 0e43bd6151b..20234847ea8 100644 --- a/recipe/fully_async_policy/fully_async_main.py +++ b/recipe/fully_async_policy/fully_async_main.py @@ -177,23 +177,22 @@ def _initialize_components(self, config) -> None: self.components["reward_fn"] = reward_fn self.components["val_reward_fn"] = val_reward_fn - self.max_queue_size = ((config.async_training.staleness_threshold + 1) - * config.data.train_batch_size - * config.actor_rollout_ref.rollout.n - ) * 10 # x 10 avoid deadlock - print("[ASYNC MAIN] Creating MessageQueue...") - message_queue = MessageQueue.remote(config, self.max_queue_size) - message_queue_client = MessageQueueClient(message_queue) - - self.components["message_queue"] = message_queue - self.components["message_queue_client"] = 
message_queue_client - print("[ASYNC MAIN] Creating FullyAsyncRollouter...") self._create_rollouter(config) print("[ASYNC MAIN] Creating FullyAsyncTrainer...") self._create_trainer(config) + print("[ASYNC MAIN] Creating MessageQueue...") + max_queue_size = ray.get(self.components["rollouter"].get_max_queue_size.remote()) + message_queue = MessageQueue.remote(config, max_queue_size) + message_queue_client = MessageQueueClient(message_queue) + self.components["message_queue"] = message_queue + self.components["message_queue_client"] = message_queue_client + + ray.get(self.components["rollouter"].set_message_queue_client.remote(self.components["message_queue_client"])) + ray.get(self.components["trainer"].set_message_queue_client.remote(self.components["message_queue_client"])) + print("[ASYNC MAIN] Setting up parameter synchronization...") from recipe.fully_async_policy.param_sync import ParameterSynchronizer @@ -220,12 +219,10 @@ def _create_rollouter(self, config) -> None: resource_pool_manager=create_resource_pool_manager(config, roles=[Role.Rollout]), ray_worker_group_cls=self.components["ray_worker_group_cls"], processor=self.components["processor"], - device_name=config.trainer.device, - max_queue_size=self.max_queue_size, + device_name=config.trainer.device ) ray.get(rollouter.init_workers.remote()) - ray.get(rollouter.set_message_queue_client.remote(self.components["message_queue_client"])) self.components["rollouter"] = rollouter print("[ASYNC MAIN] Rollouter created and initialized successfully") @@ -249,7 +246,6 @@ def _create_trainer(self, config) -> None: ) ray.get(trainer.init_workers.remote()) - ray.get(trainer.set_message_queue_client.remote(self.components["message_queue_client"])) self.components["trainer"] = trainer print("[ASYNC MAIN] FullyAsyncTrainer created and initialized successfully") @@ -279,3 +275,5 @@ def main(config): if __name__ == "__main__": main() + + diff --git a/recipe/fully_async_policy/fully_async_rollouter.py 
b/recipe/fully_async_policy/fully_async_rollouter.py index 5f6f2b0d589..a31d903d43a 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -43,8 +43,7 @@ def __init__( processor=None, reward_fn=None, val_reward_fn=None, - device_name=None, - max_queue_size=1000, + device_name=None ): # Store the tokenizer for text processing self.tokenizer = tokenizer @@ -59,7 +58,6 @@ def __init__( assert self.config.data.train_batch_size == 0, "train_batch_size must be zero" assert self.config.data.gen_batch_size == 1, "gen_batch_size must be one" - self.role_worker_mapping = role_worker_mapping self.resource_pool_manager = resource_pool_manager self.ray_worker_group_cls = ray_worker_group_cls @@ -125,9 +123,6 @@ def __init__( # Parameter synchronization related self.param_synchronizer = None - # queue size - self.max_queue_size = max_queue_size - self.async_rollout_manager = None # 流式处理相关配置 @@ -144,6 +139,8 @@ def __init__( config.actor_rollout_ref.actor.ppo_mini_batch_size) self.max_required_samples = self.required_samples * (self.staleness_threshold + 1) + # queue size + self.max_queue_size = self.max_required_samples * 10 # x 10 avoid deadlock async def set_message_queue_client(self, message_queue_client: MessageQueueClient): """Set message queue client""" @@ -159,6 +156,9 @@ def get_rollout_wg(self): """Get rollout worker group""" return self.rollout_wg + def get_max_queue_size(self): + return self.max_queue_size + async def update_param_version(self, version: int): """Update current parameter version""" async with self.lock: @@ -227,7 +227,7 @@ async def _feed_samples(self): # 检查是否到达最后一步 if self.global_steps >= self.total_rollout_steps: - print("[FullyAsyncRollouter] 达到最大步数,停止添加新样本") + print(f"[FullyAsyncRollouter] 达到最大步数,停止添加新样本 {self.global_steps} >= {self.total_rollout_steps}") break self.global_steps += 1 @@ -334,7 +334,7 @@ async def _consumer_worker(self): data=result["agent_loop_output"], # 直接存储 
AgentLoopOutput rollout_metadata=rollout_metadata, ) - success = self.message_queue_client.put_sample( + success = await self.message_queue_client.put_sample( sample=ray.cloudpickle.dumps(queue_sample), param_version=result["param_version"], ) @@ -432,13 +432,16 @@ async def _streaming_generation_main(self): self.running = False # 发送终止信号 - self.message_queue_client.put_sample( + await self.message_queue_client.put_sample( sample=None, param_version=self.current_param_version, ) - def fit(self): - """Start the async rollouter - entry point that sets up and runs async tasks""" + async def fit(self): + """ + Start the async rollouter - entry point that sets up and runs async tasks + Main async fit method that coordinates all coroutines""" + print("[FullyAsyncRollouter] Starting FullyAsyncRollouter...") if self.message_queue_client is None: @@ -446,11 +449,6 @@ def fit(self): if self.param_synchronizer is None: raise ValueError("param_synchronizer client not set. Call set_parameter_synchronizer() first.") - # Run everything in a single async event loop - asyncio.run(self._async_fit()) - - async def _async_fit(self): - """Main async fit method that coordinates all coroutines""" # 设置运行状态 async with self.lock: self.running = True @@ -506,7 +504,7 @@ async def _async_monitor_loop(self): async def _should_pause_generation(self) -> bool: """Determine whether the build should be paused""" - queue_stats = self.message_queue_client.get_statistics() + queue_stats = self.message_queue_client.get_statistics_sync() queue_size = queue_stats["queue_size"] current_trainer_version = queue_stats["current_param_version"] @@ -571,7 +569,7 @@ async def resume(self) -> bool: async def get_statistics(self) -> dict: async with self.lock: - queue_stats = self.message_queue_client.get_statistics() + queue_stats = self.message_queue_client.get_statistics_sync() stats = { "is_running": self.running, "total_generated_samples": self.total_generated_samples, @@ -587,3 +585,5 @@ async def 
get_statistics(self) -> dict: } return stats + + diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 21b2eda259e..7b1c725f667 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -139,7 +139,7 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]: while len(queue_samples) < self.required_samples: # 获取单个样本,会一直等待直到有样本或收到None - sample = self.message_queue_client.get_sample() + sample = self.message_queue_client.get_sample_sync() if sample is None: # 检测到结束信号(None),立即退出 @@ -202,7 +202,6 @@ def _assemble_gen_batch_output_from_queue_samples(self, queue_samples: list[Queu processing_times.append(sample.rollout_metadata.get("processing_time", 0)) # Use the static method to postprocess AgentLoopOutput list into DataProto - from verl.experimental.agent_loop.agent_loop import AgentLoopWorker batch = postprocess_agent_loop_outputs(agent_loop_outputs, self.tokenizer, self.config) @@ -299,6 +298,9 @@ def _init_models(self): self.actor_wg.init_model() self.actor_rollout_wg = self.actor_wg # to be compatible with the functions that not be modified + def _init_async_rollout_manager(self): + pass + def fit(self): """ The training loop of PPO. 
@@ -385,7 +387,7 @@ def fit(self): def get_statistics(self) -> dict: """Get training statistics""" - queue_stats = self.message_queue_client.get_statistics() if self.message_queue_client else {} + queue_stats = self.message_queue_client.get_statistics_sync() if self.message_queue_client else {} return { "global_steps": self.global_steps, "processed_samples": self.processed_samples, @@ -460,3 +462,4 @@ def _compute_sample_freshness_metrics(self, batch_samples: list[QueueSample]) -> except Exception as e: logger.error(f"Error computing freshness metrics: {e}") return {"freshness/error": str(e)} + diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py index 2e8ad6b0e79..9a093296743 100644 --- a/recipe/fully_async_policy/message_queue.py +++ b/recipe/fully_async_policy/message_queue.py @@ -311,6 +311,10 @@ def get_samples_sync(self, min_batch_count: int = 1) -> tuple[list[Any], int]: """Get batch from queue (sync - deprecated, use get_samples instead)""" return ray.get(self.queue_actor.get_samples.remote(min_batch_count)) + def get_sample_sync(self) -> Any | None: + """Get single sample from queue (sync - deprecated, use get_sample instead)""" + return ray.get(self.queue_actor.get_sample.remote()) + def get_statistics_sync(self) -> dict[str, Any]: """Get statistics (sync - deprecated, use get_statistics instead)""" return ray.get(self.queue_actor.get_statistics.remote()) diff --git a/recipe/one_step_off_policy/sharding_manager.py b/recipe/one_step_off_policy/detach_sharding_manager.py similarity index 92% rename from recipe/one_step_off_policy/sharding_manager.py rename to recipe/one_step_off_policy/detach_sharding_manager.py index bc3dae69031..6b304baa276 100644 --- a/recipe/one_step_off_policy/sharding_manager.py +++ b/recipe/one_step_off_policy/detach_sharding_manager.py @@ -47,12 +47,14 @@ def __init__(self, inference_engine, device_mesh: DeviceMesh): @GPUMemoryLogger(role="vllm sharding_manager", logger=logger) def 
__enter__(self): - get_torch_device().set_rng_state(self.gen_random_states) + # get_torch_device().set_rng_state(self.gen_random_states) + pass @GPUMemoryLogger(role="vllm sharding_manager", logger=logger) def __exit__(self, exc_type, exc_value, traceback): - self.gen_random_states = get_torch_device().get_rng_state() - self.inference_engine.reset_prefix_cache() + # self.gen_random_states = get_torch_device().get_rng_state() + # self.inference_engine.reset_prefix_cache() + pass @GPUMemoryLogger(role="vllm sharding_manager", logger=logger) def preprocess_data(self, data: DataProto) -> DataProto: @@ -72,3 +74,5 @@ def postprocess_data(self, data: DataProto) -> DataProto: return data return data.chunk(chunks=self.tp_size)[self.tp_rank] + + diff --git a/recipe/one_step_off_policy/fsdp_workers.py b/recipe/one_step_off_policy/fsdp_workers.py index e6ab9d1c241..04c66c8b60a 100644 --- a/recipe/one_step_off_policy/fsdp_workers.py +++ b/recipe/one_step_off_policy/fsdp_workers.py @@ -50,6 +50,32 @@ __all__ = ["DetachActorWorker", "DetachRolloutWorker", "DetachAsyncRolloutWorker", "CriticWorker"] +def get_inference_model(rollout): + """ + 根据不同类型的inference_engine获取模型对象 + Args: + rollout: rollout对象,包含inference_engine + Returns: + model: 模型对象 + """ + inference_engine = rollout.inference_engine + # 判断inference_engine的类型 + if hasattr(inference_engine, 'llm_engine'): + # LLM类型 - vLLMRollout + inference_model = ( + inference_engine.llm_engine.model_executor.driver_worker.worker.model_runner.model + ) + elif hasattr(inference_engine, 'worker'): + # WorkerWrapperBase类型 - vLLMAsyncRollout + inference_model = inference_engine.worker.model_runner.model + else: + raise AttributeError( + f"Unsupported inference_engine type: {type(inference_engine)}. " + f"Expected LLM (with llm_engine attribute) or WorkerWrapperBase (with worker attribute)." 
+ ) + return inference_model + + class DetachNcclSync(ActorRolloutRefWorker): def _get_actor_params(self): @@ -62,9 +88,7 @@ def sync_rollout_weights(self): params = self._get_actor_params() if self._is_actor else None if self._is_rollout: - inference_model = ( - self.rollout.inference_engine.llm_engine.model_executor.driver_worker.worker.model_runner.model - ) + inference_model = get_inference_model(self.rollout) patch_vllm_moe_model_weight_loader(inference_model) for key, shape, dtype in self._weights_info: tensor = torch.empty(shape, dtype=dtype, device=get_torch_device().current_device()) @@ -209,15 +233,15 @@ def init_model(self): ) log_gpu_memory_usage(f"After building {rollout_name} rollout", logger=logger) - from sharding_manager import DetachShardingManager - rollout_sharding_manager = DetachShardingManager( + from .detach_sharding_manager import DetachShardingManager + sharding_manager = DetachShardingManager( inference_engine=rollout.inference_engine, device_mesh=rollout_device_mesh ) log_gpu_memory_usage("After building sharding manager", logger=logger) self.rollout = rollout - self.rollout_sharding_manager = rollout_sharding_manager + self.rollout_sharding_manager = sharding_manager @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO, blocking=False) def async_generate_sequences(self, *args, **kwargs): @@ -238,3 +262,12 @@ def __init__(self, config: DictConfig, role: str): def init_model(self): print(f"[DetachAsyncRolloutWorker] init_model") DetachRolloutWorker.init_model(self) + + self.vllm_tp_size = self.config.rollout.tensor_model_parallel_size + self.vllm_dp_rank = int(os.environ["RANK"]) // self.vllm_tp_size + self.vllm_tp_rank = int(os.environ["RANK"]) % self.vllm_tp_size + + # used for sleep/wake_up + self.rollout.sharding_manager = self.rollout_sharding_manager + + diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh index 5662d6cb479..337f2991a16 100644 --- 
a/tests/special_e2e/run_fully_async_policy.sh +++ b/tests/special_e2e/run_fully_async_policy.sh @@ -46,7 +46,7 @@ gen_prompt_bsz=1 n_resp_per_prompt=3 train_prompt_mini_bsz=1 -total_rollout_steps=10 +total_rollout_steps=1000 # Temperature parameters temperature=1.0 From efa664073cc151a7b1272b509323f511b6bef03b Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Fri, 15 Aug 2025 01:03:25 +0800 Subject: [PATCH 048/182] RolloutSample --- recipe/fully_async_policy/fully_async_main.py | 12 +- .../fully_async_rollouter.py | 199 ++++++++++++------ .../fully_async_policy/fully_async_trainer.py | 198 +++++++++-------- recipe/fully_async_policy/message_queue.py | 36 +++- recipe/fully_async_policy/utils.py | 1 + .../detach_sharding_manager.py | 2 - recipe/one_step_off_policy/fsdp_workers.py | 17 +- recipe/one_step_off_policy/main_ppo.py | 21 +- .../one_step_off_policy/megatron_workers.py | 10 +- 9 files changed, 287 insertions(+), 209 deletions(-) diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py index 20234847ea8..1d4e64b1ca4 100644 --- a/recipe/fully_async_policy/fully_async_main.py +++ b/recipe/fully_async_policy/fully_async_main.py @@ -82,21 +82,23 @@ def create_role_worker_mapping(config): if config.actor_rollout_ref.actor.strategy == "fsdp2": assert config.actor_rollout_ref.actor.strategy == config.critic.strategy from recipe.one_step_off_policy.fsdp_workers import ( + CriticWorker, DetachActorWorker, DetachAsyncRolloutWorker, - CriticWorker, ) from verl.single_controller.ray import RayWorkerGroup + ray_worker_group_cls = RayWorkerGroup elif config.actor_rollout_ref.actor.strategy == "megatron": assert config.actor_rollout_ref.actor.strategy == config.critic.strategy from recipe.one_step_off_policy.megatron_workers import ( + CriticWorker, DetachActorWorker, DetachAsyncRolloutWorker, - CriticWorker, ) from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup + ray_worker_group_cls = 
NVMegatronRayWorkerGroup else: @@ -120,7 +122,7 @@ def create_role_worker_mapping(config): # 添加reference policy(如果需要KL loss或reward) if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss: - role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker) + role_worker_mapping[Role.RefPolicy] = ray.remote(DetachActorWorker) return role_worker_mapping, ray_worker_group_cls @@ -219,7 +221,7 @@ def _create_rollouter(self, config) -> None: resource_pool_manager=create_resource_pool_manager(config, roles=[Role.Rollout]), ray_worker_group_cls=self.components["ray_worker_group_cls"], processor=self.components["processor"], - device_name=config.trainer.device + device_name=config.trainer.device, ) ray.get(rollouter.init_workers.remote()) @@ -275,5 +277,3 @@ def main(config): if __name__ == "__main__": main() - - diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index a31d903d43a..2e729228153 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -18,8 +18,9 @@ import ray from omegaconf import OmegaConf -from recipe.fully_async_policy.message_queue import MessageQueueClient, QueueSample +from recipe.fully_async_policy.message_queue import MessageQueueClient, RolloutSample from recipe.fully_async_policy.utils import calculate_one_step_size +from verl import DataProto from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup from verl.trainer.ppo.ray_trainer import RayPPOTrainer, ResourcePoolManager, Role, WorkerType from verl.utils.tracking import ValidationGenerationsLogger @@ -34,16 +35,16 @@ class FullyAsyncRollouter(RayPPOTrainer): """ def __init__( - self, - config, - tokenizer, - role_worker_mapping: dict[Role, WorkerType], - resource_pool_manager: ResourcePoolManager, - ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, - processor=None, - reward_fn=None, - val_reward_fn=None, - 
device_name=None + self, + config, + tokenizer, + role_worker_mapping: dict[Role, WorkerType], + resource_pool_manager: ResourcePoolManager, + ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, + processor=None, + reward_fn=None, + val_reward_fn=None, + device_name=None, ): # Store the tokenizer for text processing self.tokenizer = tokenizer @@ -135,8 +136,9 @@ def __init__( self.queue_full_pause_count = 0 # 队列满导致的暂停次数 # Calculate the samples needed for a train, used to calculate staleness and interrupt rollout - self.required_samples = calculate_one_step_size(self.minimal_bsz, - config.actor_rollout_ref.actor.ppo_mini_batch_size) + self.required_samples = calculate_one_step_size( + self.minimal_bsz, config.actor_rollout_ref.actor.ppo_mini_batch_size + ) self.max_required_samples = self.required_samples * (self.staleness_threshold + 1) # queue size @@ -216,34 +218,105 @@ async def _feed_samples(self): continuous_iterator = self._create_continuous_iterator() sample_count = 0 for epoch, batch_dict in continuous_iterator: - # 准备样本数据 - sample_id = f"sample_{epoch}_{sample_count}" - batch, gen_batch = self._prepare_generate_batch(batch_dict) + # 类似 _prepare_generate_batch 的逻辑:分离数据 + original_batch, gen_data = self._prepare_single_generation_data(batch_dict) + + # 根据 rollout.n 进行重复 + n_repeats = self.config.actor_rollout_ref.rollout.n + + for rollout_n_index in range(n_repeats): + sample_id = f"sample_{epoch}_{sample_count}_{rollout_n_index}" + + partial_rollout_sample = RolloutSample( + original_batch_dict=original_batch, + agent_loop_output=None, # 待处理后填充 + sample_id=sample_id, + epoch=epoch, + rollout_n_index=rollout_n_index, + original_sample_index=sample_count, + processing_time=0.0, # 待处理后填充 + generation_timestamp=0.0, # 待处理后填充 + param_version=0, # 待处理后填充 + _gen_data=gen_data, # 临时字段,处理完后删除 + ) - sample_data = {"sample_id": sample_id, "gen_batch": gen_batch, "epoch": epoch, "timestamp": time.time()} + # 将生成数据附加到 RolloutSample 中(临时字段) - await 
self.pending_samples_queue.put(sample_data) - sample_count += 1 + await self.pending_samples_queue.put(partial_rollout_sample) - # 检查是否到达最后一步 - if self.global_steps >= self.total_rollout_steps: - print(f"[FullyAsyncRollouter] 达到最大步数,停止添加新样本 {self.global_steps} >= {self.total_rollout_steps}") - break + # 检查是否到达最后一步 + if self.global_steps >= self.total_rollout_steps: + print( + f"[FullyAsyncRollouter] 达到最大步数,停止添加新样本 " + f"{self.global_steps} >= {self.total_rollout_steps}" + ) + break + + self.global_steps += 1 - self.global_steps += 1 + sample_count += 1 # 发送结束信号 await self.pending_samples_queue.put("DONE") + def _prepare_single_generation_data(self, batch_dict): + """ + 类似 ray_trainer._prepare_generate_batch 的逻辑,但针对单个样本 + 分离出用于生成的数据和需要保留的原始数据 + + Returns: + tuple: (original_batch_dict, gen_data_for_single_sample) + """ + from verl import DataProto + + # 创建完整的 DataProto + full_batch = DataProto.from_single_dict(batch_dict) + + # 定义需要传递给生成服务器的字段 + batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"] + non_tensor_batch_keys_to_pop = ["raw_prompt_ids"] + + # 处理可选字段 + optional_fields = [ + "multi_modal_data", + "raw_prompt", + "tools_kwargs", + "interaction_kwargs", + "index", + "agent_name", + ] + + for field in optional_fields: + if field in full_batch.non_tensor_batch: + non_tensor_batch_keys_to_pop.append(field) + + # 分离数据:gen_batch 用于生成,original_batch 保留原始信息 + gen_batch = full_batch.pop( + batch_keys=batch_keys_to_pop, + non_tensor_batch_keys=non_tensor_batch_keys_to_pop, + ) + + # 添加全局步数到生成数据 + gen_batch.meta_info["global_steps"] = self.global_steps + + # 保留原始 batch 信息(转换为字典格式以便序列化) + original_batch_dict = { + "batch": {k: v.clone() if hasattr(v, "clone") else v for k, v in full_batch.batch.items()}, + "non_tensor_batch": dict(full_batch.non_tensor_batch), + "meta_info": dict(full_batch.meta_info), + } + + return original_batch_dict, gen_batch + async def _submit_worker(self): """流式处理工作协程 - 逐个样本立即提交处理,不等待批次""" active_tasks = set() while True: - # 
获取待处理样本 - sample_data = await self.pending_samples_queue.get() + # 获取待处理的部分 RolloutSample + partial_rollout_sample = await self.pending_samples_queue.get() - if sample_data == "DONE": + if partial_rollout_sample == "DONE": print("收到结束信号,等待剩余任务完成...") # 等待所有活动任务完成 if active_tasks: @@ -261,41 +334,48 @@ async def _submit_worker(self): # 立即提交单个样本处理 task = asyncio.create_task( - self._process_single_sample_streaming(sample_data), name=f"process_{sample_data['sample_id']}" + self._process_single_sample_streaming(partial_rollout_sample), + name=f"process_{partial_rollout_sample.sample_id}", ) active_tasks.add(task) # 标记队列任务完成 self.pending_samples_queue.task_done() - async def _process_single_sample_streaming(self, sample_data: dict): + async def _process_single_sample_streaming(self, partial_rollout_sample): """流式处理单个样本""" # 检查是否需要暂停处理 if await self._should_pause_generation(): - print(f"[FullyAsyncRollouter] 暂停处理样本 {sample_data['sample_id']}") + print(f"[FullyAsyncRollouter] 暂停处理样本 {partial_rollout_sample.sample_id}") # 暂停时重新放回队列 - await self.pending_samples_queue.put(sample_data) + await self.pending_samples_queue.put(partial_rollout_sample) return start_time = time.time() - # 直接使用AgentLoopManager的单样本异步处理能力 + + # 从 RolloutSample 中提取生成数据(临时字段) + gen_data = partial_rollout_sample._gen_data + + # 将单个样本数据包装成 DataProto (用于 generate_single_sample_async) + gen_batch_single = DataProto.from_items([gen_data]) + + # 调用异步生成方法 agent_loop_output, processing_time = await self.async_rollout_manager.generate_single_sample_async( - sample_data["gen_batch"], sample_data["sample_id"] + gen_batch_single, partial_rollout_sample.sample_id ) end_time = time.time() - # 组装最终结果 - final_result = { - "sample_id": sample_data["sample_id"], - "agent_loop_output": agent_loop_output, - "processing_time": processing_time, - "timestamp": time.time(), - "param_version": self.current_param_version, - "epoch": sample_data["epoch"], - } + # 直接更新 RolloutSample 对象,填充剩余字段 + 
partial_rollout_sample.agent_loop_output = agent_loop_output + partial_rollout_sample.processing_time = processing_time + partial_rollout_sample.generation_timestamp = time.time() + partial_rollout_sample.param_version = self.current_param_version - # 立即放入结果队列 - await self.result_queue.put(final_result) + # 删除临时字段 + delattr(partial_rollout_sample, "_gen_data") + + # 直接放入结果队列 + await self.result_queue.put(partial_rollout_sample) async with self.lock: self.processed_sample_count += 1 @@ -304,7 +384,7 @@ async def _process_single_sample_streaming(self, sample_data: dict): self.max_processing_time = processing_time print( - f"[FullyAsyncRollouter] 样本 {sample_data['sample_id']} 处理完成," + f"[FullyAsyncRollouter] 样本 {partial_rollout_sample.sample_id} 处理完成," f"耗时 {processing_time:.2f}s {end_time - start_time:.2f}s" ) @@ -317,26 +397,13 @@ async def _consumer_worker(self): if self.result_queue.empty(): break - # 从结果队列获取处理结果 - result = await self.result_queue.get() - - # 准备rollout metadata - rollout_metadata = { - "generation_timestamp": result["timestamp"], - "rollout_param_version": result["param_version"], - "processing_time": result["processing_time"], - "epoch": result["epoch"], - "agent_loop_metrics": result["agent_loop_output"].metrics.model_dump(), - } + # 从结果队列获取 RolloutSample + rollout_sample = await self.result_queue.get() - # 直接将 AgentLoopOutput 放入消息队列 - queue_sample = QueueSample( - data=result["agent_loop_output"], # 直接存储 AgentLoopOutput - rollout_metadata=rollout_metadata, - ) + # 直接将 RolloutSample 放入消息队列 success = await self.message_queue_client.put_sample( - sample=ray.cloudpickle.dumps(queue_sample), - param_version=result["param_version"], + sample=ray.cloudpickle.dumps(rollout_sample), + param_version=rollout_sample.param_version, ) async with self.lock: @@ -347,9 +414,9 @@ async def _consumer_worker(self): self.dropped_stale_samples += 1 print( - f"[FullyAsyncRollouter] 🔥 消费样本 {result['sample_id']}: " + f"[FullyAsyncRollouter] 消费样本 
{rollout_sample.sample_id}: " f"{'成功' if success else '失败'}放入到消息队列, " - f"处理时间 {result['processing_time']:.2f}s" + f"处理时间 {rollout_sample.processing_time:.2f}s" ) # 标记结果队列任务完成 @@ -585,5 +652,3 @@ async def get_statistics(self) -> dict: } return stats - - diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 7b1c725f667..ffdf261126f 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -21,7 +21,7 @@ import ray from omegaconf import OmegaConf -from recipe.fully_async_policy.message_queue import MessageQueueClient, QueueSample +from recipe.fully_async_policy.message_queue import MessageQueueClient, RolloutSample from recipe.fully_async_policy.utils import calculate_one_step_size from verl.experimental.agent_loop.agent_loop import postprocess_agent_loop_outputs from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup @@ -46,16 +46,16 @@ class FullyAsyncTrainer(RayPPOTrainer): """ def __init__( - self, - config, - tokenizer, - role_worker_mapping: dict[Role, WorkerType], - resource_pool_manager: ResourcePoolManager, - ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, - processor=None, - reward_fn=None, - val_reward_fn=None, - device_name=None, + self, + config, + tokenizer, + role_worker_mapping: dict[Role, WorkerType], + resource_pool_manager: ResourcePoolManager, + ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, + processor=None, + reward_fn=None, + val_reward_fn=None, + device_name=None, ): # Store the tokenizer for text processing self.tokenizer = tokenizer @@ -104,8 +104,9 @@ def __init__( self.stale_samples_processed = 0 self.current_param_version = 0 - self.required_samples = calculate_one_step_size(self.minimal_bsz, - config.actor_rollout_ref.actor.ppo_mini_batch_size) + self.required_samples = calculate_one_step_size( + self.minimal_bsz, config.actor_rollout_ref.actor.ppo_mini_batch_size + ) def 
set_message_queue_client(self, message_queue_client: MessageQueueClient): """Set message queue client""" @@ -128,8 +129,7 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]: tuple: (epoch, batch_dict, gen_batch_output) """ print( - "[FullyAsyncTrainer] " - f"Requesting {self.required_samples} samples from queue", + f"[FullyAsyncTrainer] Requesting {self.required_samples} samples from queue", flush=True, ) @@ -166,17 +166,18 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]: ) queue_samples = [ray.cloudpickle.loads(x) for x in queue_samples] - # Assemble batch + # Assemble batch - now working directly with RolloutSample objects batch = self._assemble_gen_batch_output_from_queue_samples(queue_samples) return 0, batch - def _assemble_gen_batch_output_from_queue_samples(self, queue_samples: list[QueueSample]): + def _assemble_gen_batch_output_from_queue_samples(self, rollout_samples: list[RolloutSample]): """ - Assemble gen_batch_output from queue samples containing AgentLoopOutput + Assemble gen_batch_output from RolloutSample objects + 从 RolloutSample 对象中组装批次,类似 ray_trainer 的 _post_generate_batch 逻辑 Args: - queue_samples: List of samples from queue, each containing AgentLoopOutput + rollout_samples: List of RolloutSample objects Returns: DataProto: Assembled gen_batch_output @@ -184,91 +185,89 @@ def _assemble_gen_batch_output_from_queue_samples(self, queue_samples: list[Queu start_time = time.time() import numpy as np + import torch - if not queue_samples: - raise ValueError("Empty queue_samples provided for batch assembly") - - print(f"[FullyAsyncTrainer] Assembling batch from {len(queue_samples)} queue samples with AgentLoopOutput") - - # Extract AgentLoopOutput and metadata from all samples - agent_loop_outputs = [] - rollout_metadata_list = [] - processing_times = [] - - for sample in queue_samples: - # sample.data is now AgentLoopOutput - agent_loop_outputs.append(sample.data) - 
rollout_metadata_list.append(sample.rollout_metadata) - processing_times.append(sample.rollout_metadata.get("processing_time", 0)) - - # Use the static method to postprocess AgentLoopOutput list into DataProto - - batch = postprocess_agent_loop_outputs(agent_loop_outputs, self.tokenizer, self.config) - - # Apply _post_generate_batch logic here - batch = self._post_generate_batch_for_agent_outputs(batch, agent_loop_outputs) - - # Collect timing information and metadata - param_versions = [] - sample_timestamps = [] - for metadata in rollout_metadata_list: - # Extract parameter version and timestamp - param_versions.append(metadata.get("rollout_param_version", 0)) - sample_timestamps.append(metadata.get("generation_timestamp", time.time())) - - # Create meta_info - meta_info = { - "timing": {"avg_processing_time": np.mean(processing_times) if processing_times else 0}, - "queue_sample_count": len(queue_samples), - "rollout_param_versions": param_versions, - "sample_timestamps": sample_timestamps, - "param_version_diversity": len(set(param_versions)), - "avg_sample_age": np.mean([time.time() - ts for ts in sample_timestamps]), - } + from verl import DataProto + from verl.trainer.ppo.ray_trainer import compute_response_mask - batch.meta_info.update(meta_info) + if not rollout_samples: + raise ValueError("Empty rollout_samples provided for batch assembly") - end_time = time.time() - print( - f"[FullyAsyncTrainer] Assembled batch with meta_info: " - f"{meta_info}, time elapsed: {end_time - start_time:.2f} seconds" - ) + print(f"[FullyAsyncTrainer] Assembling batch from {len(rollout_samples)} RolloutSample objects") - return batch + # 直接处理 RolloutSample 对象 + processing_times = [rs.processing_time for rs in rollout_samples] - def _post_generate_batch_for_agent_outputs(self, batch, agent_loop_outputs): - """ - Apply _post_generate_batch logic for AgentLoopOutput + # 第一步:从 AgentLoopOutput 创建生成结果的 DataProto + agent_loop_outputs = [rs.agent_loop_output for rs in 
rollout_samples] + gen_batch_output = postprocess_agent_loop_outputs(agent_loop_outputs, self.tokenizer, self.config) - Args: - batch: DataProto created from AgentLoopWorker.postprocess_agent_loop_outputs - agent_loop_outputs: List of AgentLoopOutput + # 第二步:重建原始 batch 信息 + # 每个 RolloutSample 都是独立的,直接按顺序重建原始数据 + original_batch_list = [] + for rs in rollout_samples: + original_batch_dict = rs.original_batch_dict - Returns: - DataProto: Processed batch with additional metadata - """ - import uuid + # 重建 DataProto + original_batch_item = DataProto.from_single_dict( + { + **{k: v for k, v in original_batch_dict["batch"].items()}, + **{f"__{k}": v for k, v in original_batch_dict["non_tensor_batch"].items()}, + } + ) + original_batch_item.meta_info.update(original_batch_dict["meta_info"]) + original_batch_list.append(original_batch_item) - import numpy as np - import torch + # 合并所有原始样本为一个批次 + if original_batch_list: + original_batch = DataProto.from_items(original_batch_list) + else: + # 如果没有原始数据,创建空的 DataProto + original_batch = DataProto.from_single_dict({}) - from verl.trainer.ppo.ray_trainer import compute_response_mask + # 添加 UID + uids = [] + for rs in rollout_samples: + uids.append(f"uid_{rs.sample_id}") + original_batch.non_tensor_batch["uid"] = np.array(uids, dtype=object) - # Add UIDs - batch.non_tensor_batch["uid"] = np.array([str(uuid.uuid4()) for _ in range(len(batch.batch))], dtype=object) + # 直接合并原始数据和生成结果,不需要 repeat + # 因为队列中的每个 RolloutSample 都已经是独立的样本 + final_batch = original_batch.union(gen_batch_output) - # response_mask should already be in batch from AgentLoopWorker.postprocess_agent_loop_outputs - if "response_mask" not in batch.batch.keys(): - batch.batch["response_mask"] = compute_response_mask(batch) + # 计算 response_mask(如果不存在) + if "response_mask" not in final_batch.batch.keys(): + final_batch.batch["response_mask"] = compute_response_mask(final_batch) - # Balance the number of valid tokens across DP ranks if needed + # 平衡批次(如果配置了) if 
self.config.trainer.balance_batch: - self._balance_batch(batch, metrics={}) + self._balance_batch(final_batch, metrics={}) + + # 计算全局有效 token 数 + if "attention_mask" in final_batch.batch: + final_batch.meta_info["global_token_num"] = torch.sum(final_batch.batch["attention_mask"], dim=-1).tolist() + + # 收集统计信息和元数据(直接从 RolloutSample 中获取) + param_versions = [rs.param_version for rs in rollout_samples] + sample_timestamps = [rs.generation_timestamp for rs in rollout_samples] + + # 创建 meta_info + final_batch.meta_info.update( + { + "rollout_param_versions": param_versions, + "sample_timestamps": sample_timestamps, + "avg_processing_time": np.mean(processing_times) if processing_times else 0, + "max_processing_time": np.max(processing_times) if processing_times else 0, + "param_version_diversity": len(set(param_versions)) if param_versions else 0, + "avg_sample_age": np.mean([time.time() - ts for ts in sample_timestamps]) if sample_timestamps else 0, + "assembly_time": time.time() - start_time, + } + ) - # compute global_valid tokens - batch.meta_info["global_token_num"] = torch.sum(batch.batch["attention_mask"], dim=-1).tolist() + print(f"[FullyAsyncTrainer] Batch assembly completed in {time.time() - start_time:.2f}s") + print(f"[FullyAsyncTrainer] {final_batch}") - return batch + return final_batch def _create_actor_rollout_classes(self): # create actor @@ -411,33 +410,29 @@ def _trigger_parameter_sync_after_step(self): ) ray.get(self.param_synchronizer.sync_weights.remote(self.current_param_version)) - def _compute_sample_freshness_metrics(self, batch_samples: list[QueueSample]) -> dict: + def _compute_sample_freshness_metrics(self, rollout_samples: list[RolloutSample]) -> dict: """ Compute sample freshness metrics Args: - batch_samples: List of queue samples + rollout_samples: List of RolloutSample objects Returns: dict: Dictionary of freshness metrics """ - if not batch_samples: + if not rollout_samples: return {} try: - # Extract parameter versions and timestamps + 
# Extract parameter versions and timestamps directly from RolloutSample sample_ages = [] sample_latencies = [] current_time = time.time() - for sample in batch_samples: - # Get information from rollout_metadata - if hasattr(sample, "rollout_metadata") and sample.rollout_metadata: - rollout_version = sample.rollout_metadata.get("rollout_param_version", 0) - generation_time = sample.rollout_metadata.get("generation_timestamp", current_time) - else: - rollout_version = 0 - generation_time = current_time + for sample in rollout_samples: + # Get information directly from RolloutSample + rollout_version = sample.param_version + generation_time = sample.generation_timestamp age = max(0, self.current_param_version - rollout_version) latency = max(0, current_time - generation_time) @@ -462,4 +457,3 @@ def _compute_sample_freshness_metrics(self, batch_samples: list[QueueSample]) -> except Exception as e: logger.error(f"Error computing freshness metrics: {e}") return {"freshness/error": str(e)} - diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py index 9a093296743..4c3232e561b 100644 --- a/recipe/fully_async_policy/message_queue.py +++ b/recipe/fully_async_policy/message_queue.py @@ -25,9 +25,25 @@ @dataclass -class QueueSample: - data: Any - rollout_metadata: dict[str, Any] +class RolloutSample: + """Enhanced rollout sample containing both original batch info and AgentLoopOutput""" + + # Original batch information (preserved from _prepare_generate_batch) + original_batch_dict: dict[str, Any] + + # AgentLoopOutput from generation + agent_loop_output: Any # AgentLoopOutput + + # Metadata + sample_id: str + epoch: int + rollout_n_index: int # Index within the rollout.n repetitions (0, 1, ..., n-1) + original_sample_index: int # Index of the original sample before repetition + + # Processing metadata + processing_time: float + generation_timestamp: float + param_version: int @ray.remote(num_cpus=2, max_concurrency=20) @@ -236,13 
+252,17 @@ async def get_memory_usage(self) -> dict: sample = list(self.queue)[0] try: sample_size = sys.getsizeof(sample) - if hasattr(sample.data, "batch") and hasattr(sample.data.batch, "__len__"): - # If batch info is available, estimate data size - batch_size = len(sample.data.batch) - sample_size += batch_size * 1000 # Roughly estimate 1KB per batch entry + # Since we now store RolloutSample directly, estimate based on its components + if hasattr(sample, "original_batch_dict") and sample.original_batch_dict: + # Estimate batch data size + batch_data = sample.original_batch_dict.get("batch", {}) + sample_size += len(batch_data) * 1000 # Roughly estimate 1KB per batch entry + if hasattr(sample, "agent_loop_output"): + # Estimate AgentLoopOutput size + sample_size += 5000 # Roughly estimate 5KB for AgentLoopOutput total_size = sample_size * sample_count except Exception: - total_size = sample_count * 10000 # Roughly estimate 10KB per sample + total_size = sample_count * 15000 # Roughly estimate 15KB per RolloutSample return { "queue_samples": sample_count, diff --git a/recipe/fully_async_policy/utils.py b/recipe/fully_async_policy/utils.py index 71ae7c7d16d..d9afa0a9ab1 100644 --- a/recipe/fully_async_policy/utils.py +++ b/recipe/fully_async_policy/utils.py @@ -15,5 +15,6 @@ # Calculate the number of samples needed + def calculate_one_step_size(minimal_bsz, ppo_mini_batch_size): return minimal_bsz * ppo_mini_batch_size diff --git a/recipe/one_step_off_policy/detach_sharding_manager.py b/recipe/one_step_off_policy/detach_sharding_manager.py index 6b304baa276..a8a7a12c0ba 100644 --- a/recipe/one_step_off_policy/detach_sharding_manager.py +++ b/recipe/one_step_off_policy/detach_sharding_manager.py @@ -74,5 +74,3 @@ def postprocess_data(self, data: DataProto) -> DataProto: return data return data.chunk(chunks=self.tp_size)[self.tp_rank] - - diff --git a/recipe/one_step_off_policy/fsdp_workers.py b/recipe/one_step_off_policy/fsdp_workers.py index 
04c66c8b60a..086f109e434 100644 --- a/recipe/one_step_off_policy/fsdp_workers.py +++ b/recipe/one_step_off_policy/fsdp_workers.py @@ -39,8 +39,7 @@ from verl.utils.import_utils import import_external_libs from verl.utils.model import get_generation_config, update_model_config from verl.utils.vllm_utils import patch_vllm_moe_model_weight_loader -from verl.workers.fsdp_workers import ActorRolloutRefWorker, AsyncActorRolloutRefWorker -from verl.workers.fsdp_workers import CriticWorker +from verl.workers.fsdp_workers import ActorRolloutRefWorker, AsyncActorRolloutRefWorker, CriticWorker logger = logging.getLogger(__file__) logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) @@ -60,12 +59,10 @@ def get_inference_model(rollout): """ inference_engine = rollout.inference_engine # 判断inference_engine的类型 - if hasattr(inference_engine, 'llm_engine'): + if hasattr(inference_engine, "llm_engine"): # LLM类型 - vLLMRollout - inference_model = ( - inference_engine.llm_engine.model_executor.driver_worker.worker.model_runner.model - ) - elif hasattr(inference_engine, 'worker'): + inference_model = inference_engine.llm_engine.model_executor.driver_worker.worker.model_runner.model + elif hasattr(inference_engine, "worker"): # WorkerWrapperBase类型 - vLLMAsyncRollout inference_model = inference_engine.worker.model_runner.model else: @@ -77,7 +74,6 @@ def get_inference_model(rollout): class DetachNcclSync(ActorRolloutRefWorker): - def _get_actor_params(self): pass @@ -234,6 +230,7 @@ def init_model(self): log_gpu_memory_usage(f"After building {rollout_name} rollout", logger=logger) from .detach_sharding_manager import DetachShardingManager + sharding_manager = DetachShardingManager( inference_engine=rollout.inference_engine, device_mesh=rollout_device_mesh ) @@ -260,7 +257,7 @@ def __init__(self, config: DictConfig, role: str): @register(dispatch_mode=Dispatch.ONE_TO_ALL) def init_model(self): - print(f"[DetachAsyncRolloutWorker] init_model") + print("[DetachAsyncRolloutWorker] 
init_model") DetachRolloutWorker.init_model(self) self.vllm_tp_size = self.config.rollout.tensor_model_parallel_size @@ -269,5 +266,3 @@ def init_model(self): # used for sleep/wake_up self.rollout.sharding_manager = self.rollout_sharding_manager - - diff --git a/recipe/one_step_off_policy/main_ppo.py b/recipe/one_step_off_policy/main_ppo.py index d9d8f0bb849..0dcdbef3705 100644 --- a/recipe/one_step_off_policy/main_ppo.py +++ b/recipe/one_step_off_policy/main_ppo.py @@ -60,26 +60,26 @@ def run(self, config): # Define worker classes based on the actor strategy. if config.actor_rollout_ref.actor.strategy == "fsdp2": assert config.actor_rollout_ref.actor.strategy == config.critic.strategy - from verl.single_controller.ray import RayWorkerGroup - from recipe.one_step_off_policy.fsdp_workers import ( + CriticWorker, DetachActorWorker, - DetachRolloutWorker, DetachAsyncRolloutWorker, - CriticWorker, + DetachRolloutWorker, ) + from verl.single_controller.ray import RayWorkerGroup + ray_worker_group_cls = RayWorkerGroup elif config.actor_rollout_ref.actor.strategy == "megatron": assert config.actor_rollout_ref.actor.strategy == config.critic.strategy - from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup - from recipe.one_step_off_policy.megatron_workers import ( + CriticWorker, DetachActorWorker, - DetachRolloutWorker, DetachAsyncRolloutWorker, - CriticWorker, + DetachRolloutWorker, ) + from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup + ray_worker_group_cls = NVMegatronRayWorkerGroup else: @@ -90,7 +90,8 @@ def run(self, config): role_worker_mapping = { Role.Actor: ray.remote(DetachActorWorker), Role.Rollout: ray.remote( - DetachAsyncRolloutWorker if config.actor_rollout_ref.rollout.mode == "async" else DetachRolloutWorker), + DetachAsyncRolloutWorker if config.actor_rollout_ref.rollout.mode == "async" else DetachRolloutWorker + ), Role.Critic: ray.remote(CriticWorker), } @@ -132,7 +133,7 @@ def run(self, config): # Add a 
reference policy worker if KL loss or KL reward is used. if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss: - role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker) + role_worker_mapping[Role.RefPolicy] = ray.remote(DetachActorWorker) mapping[Role.RefPolicy] = global_pool_id # Load the reward manager for training and validation. diff --git a/recipe/one_step_off_policy/megatron_workers.py b/recipe/one_step_off_policy/megatron_workers.py index 9011f5a6023..5b338c5be42 100644 --- a/recipe/one_step_off_policy/megatron_workers.py +++ b/recipe/one_step_off_policy/megatron_workers.py @@ -27,8 +27,11 @@ from verl.utils.device import get_device_name, get_torch_device from verl.utils.fs import copy_to_local from verl.utils.vllm_utils import patch_vllm_moe_model_weight_loader -from verl.workers.megatron_workers import ActorRolloutRefWorker, AsyncActorRolloutRefWorker -from verl.workers.megatron_workers import CriticWorker, RewardModelWorker +from verl.workers.megatron_workers import ( + ActorRolloutRefWorker, + AsyncActorRolloutRefWorker, + CriticWorker, +) logger = logging.getLogger(__file__) logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) @@ -166,6 +169,7 @@ def init_model(self): log_gpu_memory_usage("After building vllm rollout", logger=logger) from sharding_manager import DetachShardingManager + rollout_sharding_manager = DetachShardingManager( inference_engine=rollout.inference_engine, device_mesh=rollout_device_mesh ) @@ -193,4 +197,4 @@ def __init__(self, config: DictConfig, role: str): @register(dispatch_mode=Dispatch.ONE_TO_ALL) def init_model(self): - DetachRolloutWorker.init_model(self) \ No newline at end of file + DetachRolloutWorker.init_model(self) From 966f58df6bdb8e8353f329e274f598730ca101c4 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Fri, 15 Aug 2025 10:50:00 +0800 Subject: [PATCH 049/182] RolloutSample --- .../fully_async_rollouter.py | 95 +++++++++++-------- 
.../fully_async_policy/fully_async_trainer.py | 10 +- 2 files changed, 59 insertions(+), 46 deletions(-) diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 2e729228153..3be6661c8e1 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -35,16 +35,16 @@ class FullyAsyncRollouter(RayPPOTrainer): """ def __init__( - self, - config, - tokenizer, - role_worker_mapping: dict[Role, WorkerType], - resource_pool_manager: ResourcePoolManager, - ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, - processor=None, - reward_fn=None, - val_reward_fn=None, - device_name=None, + self, + config, + tokenizer, + role_worker_mapping: dict[Role, WorkerType], + resource_pool_manager: ResourcePoolManager, + ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, + processor=None, + reward_fn=None, + val_reward_fn=None, + device_name=None, ): # Store the tokenizer for text processing self.tokenizer = tokenizer @@ -217,7 +217,12 @@ def _init_async_rollout_manager(self): async def _feed_samples(self): continuous_iterator = self._create_continuous_iterator() sample_count = 0 + should_stop = False + for epoch, batch_dict in continuous_iterator: + if should_stop: # 检查停止标志 + break + # 类似 _prepare_generate_batch 的逻辑:分离数据 original_batch, gen_data = self._prepare_single_generation_data(batch_dict) @@ -227,6 +232,7 @@ async def _feed_samples(self): for rollout_n_index in range(n_repeats): sample_id = f"sample_{epoch}_{sample_count}_{rollout_n_index}" + # 创建部分 RolloutSample,不包含 _gen_data(因为它不在数据类定义中) partial_rollout_sample = RolloutSample( original_batch_dict=original_batch, agent_loop_output=None, # 待处理后填充 @@ -237,10 +243,10 @@ async def _feed_samples(self): processing_time=0.0, # 待处理后填充 generation_timestamp=0.0, # 待处理后填充 param_version=0, # 待处理后填充 - _gen_data=gen_data, # 临时字段,处理完后删除 ) - # 将生成数据附加到 RolloutSample 中(临时字段) + # 动态添加临时字段(处理完后删除) + 
partial_rollout_sample._gen_data = gen_data await self.pending_samples_queue.put(partial_rollout_sample) @@ -250,6 +256,7 @@ async def _feed_samples(self): f"[FullyAsyncRollouter] 达到最大步数,停止添加新样本 " f"{self.global_steps} >= {self.total_rollout_steps}" ) + should_stop = True # 设置停止标志 break self.global_steps += 1 @@ -258,6 +265,7 @@ async def _feed_samples(self): # 发送结束信号 await self.pending_samples_queue.put("DONE") + print(f"[FullyAsyncRollouter] 样本添加完成,总共添加了 {self.global_steps} 个步骤的样本") def _prepare_single_generation_data(self, batch_dict): """ @@ -344,11 +352,16 @@ async def _submit_worker(self): async def _process_single_sample_streaming(self, partial_rollout_sample): """流式处理单个样本""" - # 检查是否需要暂停处理 - if await self._should_pause_generation(): - print(f"[FullyAsyncRollouter] 暂停处理样本 {partial_rollout_sample.sample_id}") - # 暂停时重新放回队列 - await self.pending_samples_queue.put(partial_rollout_sample) + # 检查是否需要暂停处理,如果需要暂停则等待resume信号 + while await self._should_pause_generation() and self.running: + print(f"[FullyAsyncRollouter] 暂停处理样本 {partial_rollout_sample.sample_id},等待resume...") + async with self.lock: + await self.condition.wait() + print(f"[FullyAsyncRollouter] 样本 {partial_rollout_sample.sample_id} 收到resume信号,继续处理") + + # 如果系统已停止,跳过处理 + if not self.running: + print(f"[FullyAsyncRollouter] 系统已停止,跳过样本 {partial_rollout_sample.sample_id}") return start_time = time.time() @@ -575,33 +588,32 @@ async def _should_pause_generation(self) -> bool: queue_size = queue_stats["queue_size"] current_trainer_version = queue_stats["current_param_version"] - async with self.lock: - version_diff = self.current_param_version - current_trainer_version - - if version_diff > self.staleness_threshold: - print( - "[FullyAsyncRollouter] " - f"Should pause due to version_diff > self.staleness_threshold: " - f"rollout_version={self.current_param_version}, " - f"trainer_version={current_trainer_version}, diff={version_diff}" - ) - return True + version_diff = self.current_param_version - 
current_trainer_version - if queue_size >= self.max_queue_size: - print( - f"[FullyAsyncRollouter] Should pause due to full queue: " - f"size={queue_size}, max={self.max_queue_size}" - ) - return True + if version_diff > self.staleness_threshold: + print( + "[FullyAsyncRollouter] " + f"Should pause due to version_diff > self.staleness_threshold: " + f"rollout_version={self.current_param_version}, " + f"trainer_version={current_trainer_version}, diff={version_diff}" + ) + return True - if self.train_step_samples >= self.max_required_samples: - print( - f"[FullyAsyncRollouter] Should pause due to step_generated_samples >= max_required_samples: " - f"self.step_generated_samples={self.train_step_samples}, max={self.max_required_samples}" - ) - return True + if queue_size >= self.max_queue_size: + print( + f"[FullyAsyncRollouter] Should pause due to full queue: " + f"size={queue_size}, max={self.max_queue_size}" + ) + return True - return False + if self.train_step_samples >= self.max_required_samples: + print( + f"[FullyAsyncRollouter] Should pause due to step_generated_samples >= max_required_samples: " + f"self.step_generated_samples={self.train_step_samples}, max={self.max_required_samples}" + ) + return True + + return False async def pause(self) -> bool: """pause rollout @@ -652,3 +664,4 @@ async def get_statistics(self) -> dict: } return stats + diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index ffdf261126f..072a26fea35 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -38,7 +38,7 @@ logger = logging.getLogger(__name__) -@ray.remote +@ray.remote(num_cpus=10) class FullyAsyncTrainer(RayPPOTrainer): """ A fully asynchronous PPO trainer that obtains samples from a MessageQueue for training. 
@@ -373,14 +373,14 @@ def fit(self): "statistics/current_param_version": self.current_param_version, } ) - batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw) - self._log_rollout(batch, reward_extra_infos_dict, timing_raw) - self._check_save_checkpoint(is_last_step, timing_raw) + # batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw) + # self._log_rollout(batch, reward_extra_infos_dict, timing_raw) + # self._check_save_checkpoint(is_last_step, timing_raw) # self._collect_metrics(batch, epoch, metrics, timing_raw) # Trigger parameter synchronization after training step - self._trigger_parameter_sync_after_step() + # self._trigger_parameter_sync_after_step() print(f"[FullyAsyncTrainer] global_steps: {self.global_steps}") self.global_steps += 1 From 28809b521a7a0377752f4fe342d263976a2d64ae Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Fri, 15 Aug 2025 18:56:28 +0800 Subject: [PATCH 050/182] success rollout --- .../fully_async_rollouter.py | 331 +++++++----------- .../fully_async_policy/fully_async_trainer.py | 74 ++-- recipe/fully_async_policy/message_queue.py | 39 +-- tests/special_e2e/run_fully_async_policy.sh | 2 +- .../rollout/vllm_rollout/vllm_async_server.py | 20 +- .../rollout/vllm_rollout/vllm_rollout_spmd.py | 3 - 6 files changed, 184 insertions(+), 285 deletions(-) diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 3be6661c8e1..16b68b3e819 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -35,16 +35,16 @@ class FullyAsyncRollouter(RayPPOTrainer): """ def __init__( - self, - config, - tokenizer, - role_worker_mapping: dict[Role, WorkerType], - resource_pool_manager: ResourcePoolManager, - ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, - processor=None, - reward_fn=None, - val_reward_fn=None, - device_name=None, + self, + config, + 
tokenizer, + role_worker_mapping: dict[Role, WorkerType], + resource_pool_manager: ResourcePoolManager, + ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, + processor=None, + reward_fn=None, + val_reward_fn=None, + device_name=None, ): # Store the tokenizer for text processing self.tokenizer = tokenizer @@ -111,9 +111,9 @@ def __init__( self.message_queue_client = None # Concurrency control - self.running = False self.paused = False - # Initialize async locks directly - asyncio.Lock() creation is synchronous + + # Initialize async locks directly self.lock = asyncio.Lock() self.condition = asyncio.Condition(self.lock) @@ -126,8 +126,14 @@ def __init__( self.async_rollout_manager = None - # 流式处理相关配置 - self.max_concurrent_samples = async_config.get("max_concurrent_samples", 512) # 最大并发处理样本数 + # Calculate the samples needed for a train, used to calculate staleness and interrupt rollout + self.required_samples = calculate_one_step_size( + self.minimal_bsz, config.actor_rollout_ref.actor.ppo_mini_batch_size + ) + self.max_required_samples = self.required_samples * (self.staleness_threshold + 1) + + # 单次最多扔一次迭代需要的样本 + self.max_concurrent_samples = self.required_samples # 流式处理统计 self.max_processing_time = 0.0 # 最长处理时间 @@ -135,14 +141,9 @@ def __init__( self.active_sample_count = 0 # 当前正在处理的样本数 self.queue_full_pause_count = 0 # 队列满导致的暂停次数 - # Calculate the samples needed for a train, used to calculate staleness and interrupt rollout - self.required_samples = calculate_one_step_size( - self.minimal_bsz, config.actor_rollout_ref.actor.ppo_mini_batch_size - ) - self.max_required_samples = self.required_samples * (self.staleness_threshold + 1) - # queue size self.max_queue_size = self.max_required_samples * 10 # x 10 avoid deadlock + print(f"[FullyAsyncRollouter] {self.max_queue_size}") async def set_message_queue_client(self, message_queue_client: MessageQueueClient): """Set message queue client""" @@ -213,60 +214,6 @@ def _init_async_rollout_manager(self): 
worker_group=self.rollout_wg, ) - # 添加样本到待处理队列的协程 - async def _feed_samples(self): - continuous_iterator = self._create_continuous_iterator() - sample_count = 0 - should_stop = False - - for epoch, batch_dict in continuous_iterator: - if should_stop: # 检查停止标志 - break - - # 类似 _prepare_generate_batch 的逻辑:分离数据 - original_batch, gen_data = self._prepare_single_generation_data(batch_dict) - - # 根据 rollout.n 进行重复 - n_repeats = self.config.actor_rollout_ref.rollout.n - - for rollout_n_index in range(n_repeats): - sample_id = f"sample_{epoch}_{sample_count}_{rollout_n_index}" - - # 创建部分 RolloutSample,不包含 _gen_data(因为它不在数据类定义中) - partial_rollout_sample = RolloutSample( - original_batch_dict=original_batch, - agent_loop_output=None, # 待处理后填充 - sample_id=sample_id, - epoch=epoch, - rollout_n_index=rollout_n_index, - original_sample_index=sample_count, - processing_time=0.0, # 待处理后填充 - generation_timestamp=0.0, # 待处理后填充 - param_version=0, # 待处理后填充 - ) - - # 动态添加临时字段(处理完后删除) - partial_rollout_sample._gen_data = gen_data - - await self.pending_samples_queue.put(partial_rollout_sample) - - # 检查是否到达最后一步 - if self.global_steps >= self.total_rollout_steps: - print( - f"[FullyAsyncRollouter] 达到最大步数,停止添加新样本 " - f"{self.global_steps} >= {self.total_rollout_steps}" - ) - should_stop = True # 设置停止标志 - break - - self.global_steps += 1 - - sample_count += 1 - - # 发送结束信号 - await self.pending_samples_queue.put("DONE") - print(f"[FullyAsyncRollouter] 样本添加完成,总共添加了 {self.global_steps} 个步骤的样本") - def _prepare_single_generation_data(self, batch_dict): """ 类似 ray_trainer._prepare_generate_batch 的逻辑,但针对单个样本 @@ -316,26 +263,87 @@ def _prepare_single_generation_data(self, batch_dict): return original_batch_dict, gen_batch - async def _submit_worker(self): + # 添加样本到待处理队列的协程 + async def _feed_samples(self): + continuous_iterator = self._create_continuous_iterator() + sample_count = 0 + should_stop = False + + for epoch, batch_dict in continuous_iterator: + if should_stop: # 检查停止标志 + break + + # 类似 
_prepare_generate_batch 的逻辑:分离数据 + original_batch, gen_data = self._prepare_single_generation_data(batch_dict) + + # 根据 rollout.n 进行重复 + for rollout_n_index in range(self.config.actor_rollout_ref.rollout.n): + sample_id = f"sample_{epoch}_{sample_count}_{rollout_n_index}" + + # 创建部分 RolloutSample,不包含 _gen_data(因为它不在数据类定义中) + partial_rollout_sample = RolloutSample( + original_batch_dict=original_batch, + agent_loop_output=None, # 待处理后填充 + sample_id=sample_id, + epoch=epoch, + rollout_n_index=rollout_n_index, + original_sample_index=sample_count, + processing_time=0.0, # 待处理后填充 + generation_timestamp=0.0, # 待处理后填充 + param_version=0, # 待处理后填充 + _gen_data=gen_data, + ) + + await self.pending_queue.put(partial_rollout_sample) + + # 检查是否到达最后一步 + if self.global_steps >= self.total_rollout_steps: + print( + f"[FullyAsyncRollouter] 达到最大步数,停止添加新样本 " + f"{self.global_steps} >= {self.total_rollout_steps}" + ) + should_stop = True # 设置停止标志 + break + + self.global_steps += 1 + + sample_count += 1 + + # 发送结束信号 + await self.pending_queue.put("DONE") + print(f"[FullyAsyncRollouter] 样本添加完成,总共添加了 {self.global_steps} 个步骤的样本") + + async def _processor_worker(self): """流式处理工作协程 - 逐个样本立即提交处理,不等待批次""" - active_tasks = set() while True: - # 获取待处理的部分 RolloutSample - partial_rollout_sample = await self.pending_samples_queue.get() + partial_rollout_sample = await self.pending_queue.get() + self.train_step_samples += 1 + async with self.lock: + if await self._should_pause_generation(): + # 等待已提交的任务结束 + await asyncio.gather(*self.active_tasks, return_exceptions=True) + self.active_tasks = set() + self.paused = True + + while self.paused: + await self.condition.wait() + + # 获取待处理的部分 RolloutSample if partial_rollout_sample == "DONE": print("收到结束信号,等待剩余任务完成...") # 等待所有活动任务完成 - if active_tasks: - await asyncio.gather(*active_tasks, return_exceptions=True) + if self.active_tasks: + await asyncio.gather(*self.active_tasks, return_exceptions=True) break # 检查并发数是否超限 - while len(active_tasks) >= 
self.max_concurrent_samples: - print(f"达到最大并发数 {self.max_concurrent_samples},等待任务完成...") + while len(self.active_tasks) >= self.max_concurrent_samples: # 等待至少一个任务完成 - done_tasks, active_tasks = await asyncio.wait(active_tasks, return_when=asyncio.FIRST_COMPLETED) + done_tasks, self.active_tasks = await asyncio.wait( + self.active_tasks, return_when=asyncio.FIRST_COMPLETED + ) # 清理已完成的任务 for task in done_tasks: await task @@ -345,92 +353,51 @@ async def _submit_worker(self): self._process_single_sample_streaming(partial_rollout_sample), name=f"process_{partial_rollout_sample.sample_id}", ) - active_tasks.add(task) + self.active_tasks.add(task) # 标记队列任务完成 - self.pending_samples_queue.task_done() + self.pending_queue.task_done() async def _process_single_sample_streaming(self, partial_rollout_sample): """流式处理单个样本""" - # 检查是否需要暂停处理,如果需要暂停则等待resume信号 - while await self._should_pause_generation() and self.running: - print(f"[FullyAsyncRollouter] 暂停处理样本 {partial_rollout_sample.sample_id},等待resume...") - async with self.lock: - await self.condition.wait() - print(f"[FullyAsyncRollouter] 样本 {partial_rollout_sample.sample_id} 收到resume信号,继续处理") - - # 如果系统已停止,跳过处理 - if not self.running: - print(f"[FullyAsyncRollouter] 系统已停止,跳过样本 {partial_rollout_sample.sample_id}") - return - - start_time = time.time() - - # 从 RolloutSample 中提取生成数据(临时字段) - gen_data = partial_rollout_sample._gen_data - - # 将单个样本数据包装成 DataProto (用于 generate_single_sample_async) - gen_batch_single = DataProto.from_items([gen_data]) # 调用异步生成方法 agent_loop_output, processing_time = await self.async_rollout_manager.generate_single_sample_async( - gen_batch_single, partial_rollout_sample.sample_id + partial_rollout_sample._gen_data, partial_rollout_sample.sample_id ) - end_time = time.time() - # 直接更新 RolloutSample 对象,填充剩余字段 partial_rollout_sample.agent_loop_output = agent_loop_output partial_rollout_sample.processing_time = processing_time partial_rollout_sample.generation_timestamp = time.time() 
partial_rollout_sample.param_version = self.current_param_version - # 删除临时字段 - delattr(partial_rollout_sample, "_gen_data") - # 直接放入结果队列 await self.result_queue.put(partial_rollout_sample) - async with self.lock: - self.processed_sample_count += 1 - # 更新最大处理时间统计 - if processing_time > self.max_processing_time: - self.max_processing_time = processing_time - - print( - f"[FullyAsyncRollouter] 样本 {partial_rollout_sample.sample_id} 处理完成," - f"耗时 {processing_time:.2f}s {end_time - start_time:.2f}s" - ) + self.processed_sample_count += 1 + # 更新最大处理时间统计 + if processing_time > self.max_processing_time: + self.max_processing_time = processing_time + + print(f"[FullyAsyncRollouter] process {partial_rollout_sample.sample_id} cost {processing_time:.2f}s") async def _consumer_worker(self): """消费者协程,负责从结果队列获取处理结果并放入消息队列""" while True: - async with self.lock: - if not self.running: - # 如果系统停止但还有结果待处理,继续处理 - if self.result_queue.empty(): - break - # 从结果队列获取 RolloutSample rollout_sample = await self.result_queue.get() - # 直接将 RolloutSample 放入消息队列 success = await self.message_queue_client.put_sample( sample=ray.cloudpickle.dumps(rollout_sample), param_version=rollout_sample.param_version, ) - async with self.lock: - if success: - self.total_generated_samples += 1 - self.train_step_samples += 1 - else: - self.dropped_stale_samples += 1 + if success: + self.total_generated_samples += 1 + else: + self.dropped_stale_samples += 1 - print( - f"[FullyAsyncRollouter] 消费样本 {rollout_sample.sample_id}: " - f"{'成功' if success else '失败'}放入到消息队列, " - f"处理时间 {rollout_sample.processing_time:.2f}s" - ) + print(f"[FullyAsyncRollouter] submit {rollout_sample.sample_id} {'success' if success else 'error'}") # 标记结果队列任务完成 self.result_queue.task_done() @@ -473,12 +440,13 @@ async def _streaming_generation_main(self): print(f"[FullyAsyncRollouter] 启动流式处理模式,最大并发样本数: {self.max_concurrent_samples}") # 初始化异步队列 - self.pending_samples_queue = asyncio.Queue(maxsize=self.max_concurrent_samples) + 
self.pending_queue = asyncio.Queue(maxsize=100) + self.active_tasks = set() self.result_queue = asyncio.Queue() # 启动流式处理协程和消费者协程 self.feed_task = asyncio.create_task(self._feed_samples()) - self.stream_processor_task = asyncio.create_task(self._submit_worker()) + self.processor_task = asyncio.create_task(self._processor_worker()) self.consumer_task = asyncio.create_task(self._consumer_worker()) # 启动样本添加协程 @@ -488,7 +456,7 @@ async def _streaming_generation_main(self): print("[FullyAsyncRollouter] 样本添加完成") # 等待流式处理完成 - await self.stream_processor_task + await self.processor_task print("[FullyAsyncRollouter] 流式处理完成") # 等待结果队列清空 @@ -500,16 +468,13 @@ async def _streaming_generation_main(self): finally: # 取消所有任务 - if self.stream_processor_task: - self.stream_processor_task.cancel() + if self.processor_task: + self.processor_task.cancel() if self.consumer_task: self.consumer_task.cancel() # 等待任务结束 - await asyncio.gather(self.stream_processor_task, self.consumer_task, return_exceptions=True) - - async with self.lock: - self.running = False + await asyncio.gather(self.processor_task, self.consumer_task, return_exceptions=True) # 发送终止信号 await self.message_queue_client.put_sample( @@ -530,9 +495,7 @@ async def fit(self): raise ValueError("param_synchronizer client not set. 
Call set_parameter_synchronizer() first.") # 设置运行状态 - async with self.lock: - self.running = True - self.paused = False + self.paused = False # 创建主要的异步任务 generation_task = asyncio.create_task(self._streaming_generation_main()) @@ -566,17 +529,12 @@ async def _async_monitor_loop(self): check_interval = 5.0 while True: - async with self.lock: - if not self.running: - break - await asyncio.sleep(check_interval) - # 定期打印统计信息 current_time = time.time() if current_time - last_stats_time >= stats_interval: stats = await self.get_statistics() - print(f"[FullyAsyncRollouter] {stats}") + print(f"[FullyAsyncRollouter] statistics {stats}") last_stats_time = current_time if not await self._should_pause_generation(): @@ -600,68 +558,49 @@ async def _should_pause_generation(self) -> bool: return True if queue_size >= self.max_queue_size: - print( - f"[FullyAsyncRollouter] Should pause due to full queue: " - f"size={queue_size}, max={self.max_queue_size}" - ) + print(f"[FullyAsyncRollouter] Should pause due to full queue: size={queue_size}, max={self.max_queue_size}") return True - if self.train_step_samples >= self.max_required_samples: + if self.train_step_samples > self.max_required_samples: print( - f"[FullyAsyncRollouter] Should pause due to step_generated_samples >= max_required_samples: " - f"self.step_generated_samples={self.train_step_samples}, max={self.max_required_samples}" + f"[FullyAsyncRollouter] Should pause due to " + f"step_generated_samples {self.train_step_samples} > max_required_samples {self.max_required_samples} " ) return True return False - async def pause(self) -> bool: + async def pause(self): """pause rollout TODO integrated Partial Rollout """ print("[FullyAsyncRollouter] pause") async with self.lock: - if not self.running: - return False - - if self.paused: - return True - self.paused = True - return True - async def resume(self) -> bool: + async def resume(self): """resume rollout TODO integrated Partial Rollout """ print("[FullyAsyncRollouter] 
resume") async with self.lock: - if not self.running: - return False - - if not self.paused: - return True - self.paused = False self.condition.notify_all() - return True async def get_statistics(self) -> dict: - async with self.lock: - queue_stats = self.message_queue_client.get_statistics_sync() - stats = { - "is_running": self.running, - "total_generated_samples": self.total_generated_samples, - "train_step_samples": self.train_step_samples, - "dropped_stale_samples": self.dropped_stale_samples, - "current_param_version": self.current_param_version, - "queue_size": queue_stats["queue_size"], - "queue_max_size": self.max_queue_size, - "max_concurrent_samples": self.max_concurrent_samples, - "max_processing_time": self.max_processing_time, - "pending_samples_queue_size": self.pending_samples_queue.qsize(), - "result_queue_size": self.result_queue.qsize(), - } - - return stats + queue_stats = self.message_queue_client.get_statistics_sync() + + stats = { + "current_param_version": self.current_param_version, + "total_generated_samples": self.total_generated_samples, + "train_step_samples": self.train_step_samples, + "dropped_stale_samples": self.dropped_stale_samples, + "queue_max_size": self.max_queue_size, + "queue_size": queue_stats["queue_size"], + "max_concurrent_samples": self.max_concurrent_samples, + "pending_queue_size": self.pending_queue.qsize(), + "active_tasks_size": len(self.active_tasks), + "result_queue_size": self.result_queue.qsize(), + } + return stats diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 072a26fea35..fd46fc08b26 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -167,9 +167,11 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]: queue_samples = [ray.cloudpickle.loads(x) for x in queue_samples] # Assemble batch - now working directly with RolloutSample objects - batch = 
self._assemble_gen_batch_output_from_queue_samples(queue_samples) - - return 0, batch + # batch = self._assemble_gen_batch_output_from_queue_samples(queue_samples) + # print(f" _assemble_gen_batch_output_from_queue_samples {batch}") + return 0, queue_samples + # + # return 0, batch def _assemble_gen_batch_output_from_queue_samples(self, rollout_samples: list[RolloutSample]): """ @@ -344,39 +346,39 @@ def fit(self): epoch, batch = self._get_samples_from_queue() if batch is None: break - - # 更新统计信息 - self.processed_samples += len(batch) if isinstance(batch, list) else 1 - - # 从meta_info中获取参数版本信息 - if hasattr(batch, "meta_info") and batch.meta_info: - rollout_param_versions = batch.meta_info.get("rollout_param_versions", []) - if rollout_param_versions: - # 统计陈旧样本 - stale_count = sum(1 for v in rollout_param_versions if self.current_param_version - v > 1) - self.stale_samples_processed += stale_count - - # 添加新鲜度指标到metrics - if rollout_param_versions: - param_version_diversity = batch.meta_info.get("param_version_diversity", 0) - avg_sample_age = batch.meta_info.get("avg_sample_age", 0) - - metrics.update( - { - "freshness/param_version_diversity": param_version_diversity, - "freshness/avg_sample_age": avg_sample_age, - "freshness/stale_samples_ratio": stale_count / len(rollout_param_versions) - if rollout_param_versions - else 0, - "statistics/processed_samples": self.processed_samples, - "statistics/stale_samples_processed": self.stale_samples_processed, - "statistics/current_param_version": self.current_param_version, - } - ) - # batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw) - # self._log_rollout(batch, reward_extra_infos_dict, timing_raw) - # self._check_save_checkpoint(is_last_step, timing_raw) - + # + # # 更新统计信息 + # self.processed_samples += len(batch) if isinstance(batch, list) else 1 + # + # # 从meta_info中获取参数版本信息 + # if hasattr(batch, "meta_info") and batch.meta_info: + # rollout_param_versions = 
batch.meta_info.get("rollout_param_versions", []) + # if rollout_param_versions: + # # 统计陈旧样本 + # stale_count = sum(1 for v in rollout_param_versions if self.current_param_version - v > 1) + # self.stale_samples_processed += stale_count + # + # # 添加新鲜度指标到metrics + # if rollout_param_versions: + # param_version_diversity = batch.meta_info.get("param_version_diversity", 0) + # avg_sample_age = batch.meta_info.get("avg_sample_age", 0) + # + # metrics.update( + # { + # "freshness/param_version_diversity": param_version_diversity, + # "freshness/avg_sample_age": avg_sample_age, + # "freshness/stale_samples_ratio": stale_count / len(rollout_param_versions) + # if rollout_param_versions + # else 0, + # "statistics/processed_samples": self.processed_samples, + # "statistics/stale_samples_processed": self.stale_samples_processed, + # "statistics/current_param_version": self.current_param_version, + # } + # ) + # batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw) + # self._log_rollout(batch, reward_extra_infos_dict, timing_raw) + # self._check_save_checkpoint(is_last_step, timing_raw) + # # self._collect_metrics(batch, epoch, metrics, timing_raw) # Trigger parameter synchronization after training step diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py index 4c3232e561b..3ece118bc81 100644 --- a/recipe/fully_async_policy/message_queue.py +++ b/recipe/fully_async_policy/message_queue.py @@ -45,6 +45,8 @@ class RolloutSample: generation_timestamp: float param_version: int + _gen_data: Any + @ray.remote(num_cpus=2, max_concurrency=20) class MessageQueue: @@ -71,25 +73,19 @@ def __init__(self, config: DictConfig, max_queue_size: int = 1000): self.running = True # async safe - 在第一次使用时初始化 - self._lock = None - self._consumer_condition = None + self._lock = asyncio.Lock() + self._consumer_condition = asyncio.Condition(self._lock) # statistic message self.total_produced = 0 self.total_consumed = 0 
self.dropped_samples = 0 - logger.info( - f"MessageQueue initialized with max_queue_size={max_queue_size}," + print( + f"[MessageQueue] initialized with max_queue_size={max_queue_size}," f"staleness_threshold={self.staleness_threshold}" ) - async def _ensure_async_primitives(self): - """确保异步原语已初始化""" - if self._lock is None: - self._lock = asyncio.Lock() - self._consumer_condition = asyncio.Condition(self._lock) - async def put_sample(self, sample: Any, param_version: int) -> bool: """ Put a batch sample into the queue @@ -101,8 +97,6 @@ async def put_sample(self, sample: Any, param_version: int) -> bool: Returns: bool: Whether the sample was successfully put into the queue """ - await self._ensure_async_primitives() - async with self._lock: # Check freshness staleness = self.current_param_version - param_version @@ -115,12 +109,12 @@ async def put_sample(self, sample: Any, param_version: int) -> bool: if len(self.queue) >= self.max_queue_size: removed = self.queue.popleft() self.dropped_samples += 1 - logger.warning(f"Queue full, dropped sample {removed}") + logger.warning(f"Queue full, dropped sample") self.queue.append(sample) self.total_produced += 1 # Notify waiting consumers - self._consumer_condition.notify() + self._consumer_condition.notify_all() if self.total_produced % 100 == 0: logger.debug(f"MessageQueue stats: produced={self.total_produced}, queue_size={len(self.queue)}") @@ -137,8 +131,6 @@ async def get_samples(self, min_batch_count: int = 1) -> tuple[list[Any], int]: Returns: List[Any]: List of retrieved samples """ - await self._ensure_async_primitives() - async with self._lock: while len(self.queue) < min_batch_count and self.running: print(f"[MessageQueue] consumer_condition {len(self.queue)}") @@ -171,10 +163,9 @@ async def get_sample(self) -> Any | None: Returns: Any: Single sample data or None if queue is closed """ - await self._ensure_async_primitives() - async with self._lock: while len(self.queue) == 0 and self.running: + 
print(f"[MessageQueue] consumer_condition {len(self.queue)}") await self._consumer_condition.wait() # If queue is closed and empty, return None @@ -188,8 +179,6 @@ async def get_sample(self) -> Any | None: async def update_param_version(self, version: int): """Update current parameter version""" - await self._ensure_async_primitives() - async with self._lock: old_version = self.current_param_version self.current_param_version = version @@ -197,15 +186,11 @@ async def update_param_version(self, version: int): async def get_queue_size(self) -> int: """Get current queue length""" - await self._ensure_async_primitives() - async with self._lock: return len(self.queue) async def get_statistics(self) -> dict[str, Any]: """Get queue statistics""" - await self._ensure_async_primitives() - async with self._lock: return { "queue_size": len(self.queue), @@ -219,8 +204,6 @@ async def get_statistics(self) -> dict[str, Any]: async def clear_queue(self): """Clear the queue""" - await self._ensure_async_primitives() - async with self._lock: cleared_count = len(self.queue) self.queue.clear() @@ -228,8 +211,6 @@ async def clear_queue(self): async def shutdown(self): """Shutdown the message queue""" - await self._ensure_async_primitives() - async with self._lock: self.running = False # Notify all waiting coroutines so they can exit @@ -238,8 +219,6 @@ async def shutdown(self): async def get_memory_usage(self) -> dict: """Get memory usage statistics""" - await self._ensure_async_primitives() - async with self._lock: # Estimate memory usage of samples in queue import sys diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh index 337f2991a16..a938499a86b 100644 --- a/tests/special_e2e/run_fully_async_policy.sh +++ b/tests/special_e2e/run_fully_async_policy.sh @@ -46,7 +46,7 @@ gen_prompt_bsz=1 n_resp_per_prompt=3 train_prompt_mini_bsz=1 -total_rollout_steps=1000 +total_rollout_steps=50 # Temperature parameters temperature=1.0 diff --git 
a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py index 8c0d608871f..a5cc0b83e59 100644 --- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py +++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py @@ -57,12 +57,6 @@ def _get_model_runner_workers(vllm_config, init_ray: bool = True): actor_name for actor_name in ray.util.list_named_actors() if actor_name.startswith(f"{wg_prefix}WorkerDict") ] - print(f"namespace: {namespace}") - print(f"wg_prefix: {wg_prefix}") - print(f"vllm_dp_size: {vllm_dp_size}") - print(f"vllm_dp_rank: {vllm_dp_rank}") - print(f"actor_names: {actor_names}") - vllm_tp_size = vllm_config.parallel_config.tensor_parallel_size assert len(actor_names) == vllm_dp_size * vllm_tp_size, ( f"instance_id: {vllm_config.instance_id} has {len(actor_names)} actors, but vllm_dp_size: " @@ -79,7 +73,6 @@ def get_pg_index_and_local_rank(actor_name) -> tuple[int, int]: actor_names = sorted(actor_names, key=get_pg_index_and_local_rank) actor_names = actor_names[vllm_dp_rank * vllm_tp_size : (vllm_dp_rank + 1) * vllm_tp_size] workers: list[WorkerWrapperBase] = [ray.get_actor(actor_name) for actor_name in actor_names] - print(f"instance_id: {vllm_config.instance_id} initializes with external actors: {actor_names}") return workers @@ -90,7 +83,6 @@ class ExternalRayDistributedExecutor(Executor): uses_ray: bool = False def _init_executor(self) -> None: - print("[ExternalRayDistributedExecutor] Initializing ray actors...") self.workers = _get_model_runner_workers(vllm_config=self.vllm_config, init_ray=True) kwargs = dict( @@ -100,11 +92,10 @@ def _init_executor(self) -> None: distributed_init_method="env://", is_driver_worker=True, ) - print(f"ray start instance_id: {self.vllm_config.instance_id} initializes") self.collective_rpc("init_worker", args=([kwargs],)) self.collective_rpc("init_device") self.collective_rpc("load_model") - print(f"ray instance_id: {self.vllm_config.instance_id} 
initializes finished.") + print(f"instance_id: {self.vllm_config.instance_id} initializes finished.") def collective_rpc( self, @@ -136,7 +127,6 @@ class ExternalZeroMQDistributedExecutor(Executor): uses_ray: bool = False def _init_executor(self) -> None: - print(f"[ExternalZeroMQDistributedExecutor] Initializing ray actors...") addresses = os.environ["VERL_VLLM_ZMQ_ADDRESSES"].split(",") self.context = zmq.Context() self.sockets = [] @@ -152,11 +142,9 @@ def _init_executor(self) -> None: distributed_init_method="env://", is_driver_worker=True, ) - print(f"ZeroMQ start instance_id: {self.vllm_config.instance_id} initializes") self.collective_rpc("init_worker", args=([kwargs],)) self.collective_rpc("init_device") self.collective_rpc("load_model") - print(f"ZeroMQ instance_id: {self.vllm_config.instance_id} initializes finished.") def collective_rpc( self, @@ -275,12 +263,8 @@ async def init_engine(self): # init async llm engine vllm_config = self._create_engine_config(engine_args) - - print(f"AsyncvLLMServer AsyncLLM.from_vllm_config {vllm_config}") self.engine = AsyncLLM.from_vllm_config(vllm_config) - print("AsyncvLLMServer build serving chat") - # build serving chat model_config = self.engine.model_config BASE_MODEL_PATHS = [BaseModelPath(name=model_name, model_path=model_path)] @@ -297,8 +281,6 @@ async def init_engine(self): tool_parser=config.multi_turn.format, # hermes, llama3_json, ... 
) - print("AsyncvLLMServer init_engine success") - def _create_engine_config(self, engine_args: AsyncEngineArgs): vllm_config = engine_args.create_engine_config() namespace = ray.get_runtime_context().namespace diff --git a/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py b/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py index 307e7e77036..0d419dcf177 100644 --- a/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py +++ b/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py @@ -460,7 +460,6 @@ def get_zeromq_address(self): def init_worker(self, all_kwargs: list[dict[str, Any]]): """Initialize worker engine.""" - print("[vLLMAsyncRollout] init_worker") all_kwargs[0]["rank"] = int(os.environ["RANK"]) all_kwargs[0]["local_rank"] = 0 @@ -471,8 +470,6 @@ def init_worker(self, all_kwargs: list[dict[str, Any]]): def load_model(self, *args, **kwargs): self.inference_engine.load_model(*args, **kwargs) - print(f"[vLLMAsyncRollout] load_model {args} {kwargs}") - # inference engine is initialized now, update sharding manager self.sharding_manager.inference_engine = self.inference_engine self.sharding_manager.model_runner = self.inference_engine.worker.model_runner From 1c06296f91aefbc5f25b28c8a5efdc0292f219c6 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Fri, 15 Aug 2025 19:04:20 +0800 Subject: [PATCH 051/182] staleness_samples --- .../fully_async_rollouter.py | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 16b68b3e819..fb0787eac69 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -35,16 +35,16 @@ class FullyAsyncRollouter(RayPPOTrainer): """ def __init__( - self, - config, - tokenizer, - role_worker_mapping: dict[Role, WorkerType], - resource_pool_manager: ResourcePoolManager, - ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, - 
processor=None, - reward_fn=None, - val_reward_fn=None, - device_name=None, + self, + config, + tokenizer, + role_worker_mapping: dict[Role, WorkerType], + resource_pool_manager: ResourcePoolManager, + ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, + processor=None, + reward_fn=None, + val_reward_fn=None, + device_name=None, ): # Store the tokenizer for text processing self.tokenizer = tokenizer @@ -103,7 +103,7 @@ def __init__( # Statistics self.total_generated_samples = 0 - self.train_step_samples = 0 + self.staleness_samples = 0 self.dropped_stale_samples = 0 # Worker groups @@ -167,8 +167,8 @@ async def update_param_version(self, version: int): async with self.lock: old_version = self.current_param_version self.current_param_version = version - # every time param change, reset train_step_samples - self.train_step_samples = 0 + # every time param change, reset staleness_samples + self.staleness_samples = 0 print(f"[FullyAsyncRollouter] Parameter version updated from {old_version} to {version}") def _validate_config(self): @@ -318,7 +318,7 @@ async def _processor_worker(self): while True: partial_rollout_sample = await self.pending_queue.get() - self.train_step_samples += 1 + self.staleness_samples += 1 async with self.lock: if await self._should_pause_generation(): @@ -561,10 +561,10 @@ async def _should_pause_generation(self) -> bool: print(f"[FullyAsyncRollouter] Should pause due to full queue: size={queue_size}, max={self.max_queue_size}") return True - if self.train_step_samples > self.max_required_samples: + if self.staleness_samples > self.max_required_samples: print( f"[FullyAsyncRollouter] Should pause due to " - f"step_generated_samples {self.train_step_samples} > max_required_samples {self.max_required_samples} " + f"step_generated_samples {self.staleness_samples} > max_required_samples {self.max_required_samples} " ) return True @@ -593,7 +593,7 @@ async def get_statistics(self) -> dict: stats = { "current_param_version": 
self.current_param_version, "total_generated_samples": self.total_generated_samples, - "train_step_samples": self.train_step_samples, + "staleness_samples": self.staleness_samples, "dropped_stale_samples": self.dropped_stale_samples, "queue_max_size": self.max_queue_size, "queue_size": queue_stats["queue_size"], From 0412861d95c57c74e366da0ef8045e4db0487f45 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Fri, 15 Aug 2025 22:45:42 +0800 Subject: [PATCH 052/182] assemble_batch_from_rollout_samples --- recipe/fully_async_policy/batch_utils.py | 124 +++++++ .../fully_async_rollouter.py | 27 +- .../fully_async_policy/fully_async_trainer.py | 104 +----- recipe/fully_async_policy/message_queue.py | 29 +- .../unittest/test_batch_utils.py | 321 ++++++++++++++++++ recipe/fully_async_policy/utils.py | 28 +- 6 files changed, 503 insertions(+), 130 deletions(-) create mode 100644 recipe/fully_async_policy/batch_utils.py create mode 100644 recipe/fully_async_policy/unittest/test_batch_utils.py diff --git a/recipe/fully_async_policy/batch_utils.py b/recipe/fully_async_policy/batch_utils.py new file mode 100644 index 00000000000..806dd9e1579 --- /dev/null +++ b/recipe/fully_async_policy/batch_utils.py @@ -0,0 +1,124 @@ +# Copyright 2025 Meituan Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import time + +import numpy as np +import torch + +from recipe.fully_async_policy.utils import RolloutSample +from verl import DataProto +from verl.experimental.agent_loop.agent_loop import postprocess_agent_loop_outputs +from verl.trainer.ppo.ray_trainer import compute_response_mask + + +def assemble_batch_from_rollout_samples( + rollout_samples: list[RolloutSample], tokenizer, config, balance_batch: bool = False +) -> DataProto: + """ + Assemble gen_batch_output from RolloutSample objects + 从 RolloutSample 对象中组装批次,类似 ray_trainer 的 _post_generate_batch 逻辑 + + Args: + rollout_samples: List of RolloutSample objects + tokenizer: Tokenizer instance + config: Configuration object containing trainer settings + balance_batch: Whether to balance the batch (simplified version) + + Returns: + DataProto: Assembled gen_batch_output + + Raises: + ValueError: If rollout_samples is empty + """ + start_time = time.time() + + if not rollout_samples: + raise ValueError("Empty rollout_samples provided for batch assembly") + + print(f"[BatchUtils] Assembling batch from {len(rollout_samples)} RolloutSample objects") + + # 直接处理 RolloutSample 对象 + processing_times = [rs.processing_time for rs in rollout_samples] + + # 第一步:从 AgentLoopOutput 创建生成结果的 DataProto + agent_loop_outputs = [rs.agent_loop_output for rs in rollout_samples] + gen_batch_output = postprocess_agent_loop_outputs(agent_loop_outputs, tokenizer, config) + + # 第二步:重建原始 batch 信息 + # 每个 RolloutSample 都是独立的,直接按顺序重建原始数据 + original_batch_list = [] + for rs in rollout_samples: + original_batch_dict = rs.original_batch_dict + + # 重建 DataProto + original_batch_item = DataProto.from_single_dict( + { + **{k: v for k, v in original_batch_dict["batch"].items()}, + **{f"__{k}": v for k, v in original_batch_dict["non_tensor_batch"].items()}, + } + ) + original_batch_item.meta_info.update(original_batch_dict["meta_info"]) + original_batch_list.append(original_batch_item) + + # 合并所有原始样本为一个批次 + if original_batch_list: + original_batch = 
DataProto.from_items(original_batch_list) + else: + # 如果没有原始数据,创建空的 DataProto + original_batch = DataProto.from_single_dict({}) + + # 添加 UID + uids = [] + for rs in rollout_samples: + uids.append(f"uid_{rs.sample_id}") + original_batch.non_tensor_batch["uid"] = np.array(uids, dtype=object) + + # 直接合并原始数据和生成结果,不需要 repeat + # 因为队列中的每个 RolloutSample 都已经是独立的样本 + final_batch = original_batch.union(gen_batch_output) + + # 计算 response_mask(如果不存在) + if "response_mask" not in final_batch.batch.keys(): + final_batch.batch["response_mask"] = compute_response_mask(final_batch) + + # 简化的批次平衡逻辑(如果需要的话) + if balance_batch and hasattr(config, "trainer") and getattr(config.trainer, "balance_batch", False): + # 注意:这里简化了批次平衡逻辑,如果需要完整功能需要额外参数 + print("[BatchUtils] Batch balancing requested but simplified in static function") + + # 计算全局有效 token 数 + if "attention_mask" in final_batch.batch: + final_batch.meta_info["global_token_num"] = torch.sum(final_batch.batch["attention_mask"], dim=-1).tolist() + + # 收集统计信息和元数据(直接从 RolloutSample 中获取) + param_versions = [rs.param_version for rs in rollout_samples] + sample_timestamps = [rs.generation_timestamp for rs in rollout_samples] + + # 创建 meta_info + final_batch.meta_info.update( + { + "rollout_param_versions": param_versions, + "sample_timestamps": sample_timestamps, + "avg_processing_time": np.mean(processing_times) if processing_times else 0, + "max_processing_time": np.max(processing_times) if processing_times else 0, + "param_version_diversity": len(set(param_versions)) if param_versions else 0, + "avg_sample_age": np.mean([time.time() - ts for ts in sample_timestamps]) if sample_timestamps else 0, + "assembly_time": time.time() - start_time, + } + ) + + print(f"[BatchUtils] Batch assembly completed in {time.time() - start_time:.2f}s") + + return final_batch diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index fb0787eac69..f166582ef73 100644 --- 
a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -18,9 +18,8 @@ import ray from omegaconf import OmegaConf -from recipe.fully_async_policy.message_queue import MessageQueueClient, RolloutSample -from recipe.fully_async_policy.utils import calculate_one_step_size -from verl import DataProto +from recipe.fully_async_policy.message_queue import MessageQueueClient +from recipe.fully_async_policy.utils import RolloutSample, calculate_one_step_size from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup from verl.trainer.ppo.ray_trainer import RayPPOTrainer, ResourcePoolManager, Role, WorkerType from verl.utils.tracking import ValidationGenerationsLogger @@ -35,16 +34,16 @@ class FullyAsyncRollouter(RayPPOTrainer): """ def __init__( - self, - config, - tokenizer, - role_worker_mapping: dict[Role, WorkerType], - resource_pool_manager: ResourcePoolManager, - ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, - processor=None, - reward_fn=None, - val_reward_fn=None, - device_name=None, + self, + config, + tokenizer, + role_worker_mapping: dict[Role, WorkerType], + resource_pool_manager: ResourcePoolManager, + ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, + processor=None, + reward_fn=None, + val_reward_fn=None, + device_name=None, ): # Store the tokenizer for text processing self.tokenizer = tokenizer @@ -564,7 +563,7 @@ async def _should_pause_generation(self) -> bool: if self.staleness_samples > self.max_required_samples: print( f"[FullyAsyncRollouter] Should pause due to " - f"step_generated_samples {self.staleness_samples} > max_required_samples {self.max_required_samples} " + f"staleness_samples {self.staleness_samples} > max_required_samples {self.max_required_samples} " ) return True diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index fd46fc08b26..cbea37c4083 100644 --- 
a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -21,9 +21,8 @@ import ray from omegaconf import OmegaConf -from recipe.fully_async_policy.message_queue import MessageQueueClient, RolloutSample -from recipe.fully_async_policy.utils import calculate_one_step_size -from verl.experimental.agent_loop.agent_loop import postprocess_agent_loop_outputs +from recipe.fully_async_policy.message_queue import MessageQueueClient +from recipe.fully_async_policy.utils import RolloutSample, calculate_one_step_size from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup from verl.trainer.ppo import core_algos from verl.trainer.ppo.core_algos import AdvantageEstimator @@ -166,9 +165,10 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]: ) queue_samples = [ray.cloudpickle.loads(x) for x in queue_samples] + print(queue_samples) # Assemble batch - now working directly with RolloutSample objects - # batch = self._assemble_gen_batch_output_from_queue_samples(queue_samples) - # print(f" _assemble_gen_batch_output_from_queue_samples {batch}") + batch = self._assemble_gen_batch_output_from_queue_samples(queue_samples) + print(f" _assemble_gen_batch_output_from_queue_samples {batch}") return 0, queue_samples # # return 0, batch @@ -184,91 +184,21 @@ def _assemble_gen_batch_output_from_queue_samples(self, rollout_samples: list[Ro Returns: DataProto: Assembled gen_batch_output """ - start_time = time.time() - - import numpy as np - import torch - - from verl import DataProto - from verl.trainer.ppo.ray_trainer import compute_response_mask - - if not rollout_samples: - raise ValueError("Empty rollout_samples provided for batch assembly") - - print(f"[FullyAsyncTrainer] Assembling batch from {len(rollout_samples)} RolloutSample objects") - - # 直接处理 RolloutSample 对象 - processing_times = [rs.processing_time for rs in rollout_samples] - - # 第一步:从 AgentLoopOutput 创建生成结果的 DataProto - 
agent_loop_outputs = [rs.agent_loop_output for rs in rollout_samples] - gen_batch_output = postprocess_agent_loop_outputs(agent_loop_outputs, self.tokenizer, self.config) - - # 第二步:重建原始 batch 信息 - # 每个 RolloutSample 都是独立的,直接按顺序重建原始数据 - original_batch_list = [] - for rs in rollout_samples: - original_batch_dict = rs.original_batch_dict - - # 重建 DataProto - original_batch_item = DataProto.from_single_dict( - { - **{k: v for k, v in original_batch_dict["batch"].items()}, - **{f"__{k}": v for k, v in original_batch_dict["non_tensor_batch"].items()}, - } - ) - original_batch_item.meta_info.update(original_batch_dict["meta_info"]) - original_batch_list.append(original_batch_item) - - # 合并所有原始样本为一个批次 - if original_batch_list: - original_batch = DataProto.from_items(original_batch_list) - else: - # 如果没有原始数据,创建空的 DataProto - original_batch = DataProto.from_single_dict({}) - - # 添加 UID - uids = [] - for rs in rollout_samples: - uids.append(f"uid_{rs.sample_id}") - original_batch.non_tensor_batch["uid"] = np.array(uids, dtype=object) - - # 直接合并原始数据和生成结果,不需要 repeat - # 因为队列中的每个 RolloutSample 都已经是独立的样本 - final_batch = original_batch.union(gen_batch_output) - - # 计算 response_mask(如果不存在) - if "response_mask" not in final_batch.batch.keys(): - final_batch.batch["response_mask"] = compute_response_mask(final_batch) + from recipe.fully_async_policy.batch_utils import assemble_batch_from_rollout_samples + + # 使用静态函数进行批次组装 + final_batch = assemble_batch_from_rollout_samples( + rollout_samples=rollout_samples, + tokenizer=self.tokenizer, + config=self.config, + balance_batch=False, # 不使用静态函数的简化版本 + ) - # 平衡批次(如果配置了) + # 如果需要完整的批次平衡,在这里调用 if self.config.trainer.balance_batch: self._balance_batch(final_batch, metrics={}) - # 计算全局有效 token 数 - if "attention_mask" in final_batch.batch: - final_batch.meta_info["global_token_num"] = torch.sum(final_batch.batch["attention_mask"], dim=-1).tolist() - - # 收集统计信息和元数据(直接从 RolloutSample 中获取) - param_versions = [rs.param_version for rs in 
rollout_samples] - sample_timestamps = [rs.generation_timestamp for rs in rollout_samples] - - # 创建 meta_info - final_batch.meta_info.update( - { - "rollout_param_versions": param_versions, - "sample_timestamps": sample_timestamps, - "avg_processing_time": np.mean(processing_times) if processing_times else 0, - "max_processing_time": np.max(processing_times) if processing_times else 0, - "param_version_diversity": len(set(param_versions)) if param_versions else 0, - "avg_sample_age": np.mean([time.time() - ts for ts in sample_timestamps]) if sample_timestamps else 0, - "assembly_time": time.time() - start_time, - } - ) - - print(f"[FullyAsyncTrainer] Batch assembly completed in {time.time() - start_time:.2f}s") print(f"[FullyAsyncTrainer] {final_batch}") - return final_batch def _create_actor_rollout_classes(self): @@ -336,10 +266,10 @@ def fit(self): # Use queue mode, no need for traditional dataloader iterator # Initialize to get the first batch of data while True: - metrics = {} + # metrics = {} timing_raw = {} - is_last_step = False + # is_last_step = False with marked_timer("step", timing_raw): with marked_timer("gen", timing_raw, color="red"): diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py index 3ece118bc81..b2761f95749 100644 --- a/recipe/fully_async_policy/message_queue.py +++ b/recipe/fully_async_policy/message_queue.py @@ -15,7 +15,6 @@ import asyncio import logging from collections import deque -from dataclasses import dataclass from typing import Any import ray @@ -24,30 +23,6 @@ logger = logging.getLogger(__name__) -@dataclass -class RolloutSample: - """Enhanced rollout sample containing both original batch info and AgentLoopOutput""" - - # Original batch information (preserved from _prepare_generate_batch) - original_batch_dict: dict[str, Any] - - # AgentLoopOutput from generation - agent_loop_output: Any # AgentLoopOutput - - # Metadata - sample_id: str - epoch: int - rollout_n_index: int # Index 
within the rollout.n repetitions (0, 1, ..., n-1) - original_sample_index: int # Index of the original sample before repetition - - # Processing metadata - processing_time: float - generation_timestamp: float - param_version: int - - _gen_data: Any - - @ray.remote(num_cpus=2, max_concurrency=20) class MessageQueue: """ @@ -107,9 +82,9 @@ async def put_sample(self, sample: Any, param_version: int) -> bool: # If queue is full, remove the oldest sample (rarely happens) if len(self.queue) >= self.max_queue_size: - removed = self.queue.popleft() + self.queue.popleft() self.dropped_samples += 1 - logger.warning(f"Queue full, dropped sample") + logger.warning("Queue full, dropped sample") self.queue.append(sample) self.total_produced += 1 diff --git a/recipe/fully_async_policy/unittest/test_batch_utils.py b/recipe/fully_async_policy/unittest/test_batch_utils.py new file mode 100644 index 00000000000..c2593f83ec7 --- /dev/null +++ b/recipe/fully_async_policy/unittest/test_batch_utils.py @@ -0,0 +1,321 @@ +#!/usr/bin/env python3 + +# Copyright 2025 Meituan Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import sys +import time +import unittest +from dataclasses import dataclass +from unittest.mock import MagicMock + +import numpy as np +import torch + +sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..", "..")) + +from recipe.fully_async_policy.batch_utils import assemble_batch_from_rollout_samples +from recipe.fully_async_policy.message_queue import RolloutSample +from verl import DataProto + + +@dataclass +class MockAgentLoopOutput: + """Mock AgentLoopOutput for testing""" + + prompt_ids: list[int] + response_ids: list[int] + response_mask: list[int] + num_turns: int = 1 + metrics: dict = None + + def __post_init__(self): + if self.metrics is None: + self.metrics = {} + + +class MockConfig: + """Mock configuration object""" + + def __init__(self): + self.trainer = MockTrainerConfig() + + +class MockTrainerConfig: + """Mock trainer configuration""" + + def __init__(self): + self.balance_batch = False + + +class TestBatchUtils(unittest.TestCase): + def setUp(self): + """设置测试环境""" + self.tokenizer = MagicMock() + self.config = MockConfig() + + # Mock postprocess_agent_loop_outputs function + self.mock_postprocess = MagicMock() + + # Patch the postprocess function + import recipe.fully_async_policy.batch_utils as batch_utils_module + + self.original_postprocess = batch_utils_module.postprocess_agent_loop_outputs + batch_utils_module.postprocess_agent_loop_outputs = self.mock_postprocess + + # Mock compute_response_mask function + self.original_compute_response_mask = batch_utils_module.compute_response_mask + batch_utils_module.compute_response_mask = MagicMock(return_value=torch.ones(2, 128, dtype=torch.int64)) + + def tearDown(self): + """清理测试环境""" + import recipe.fully_async_policy.batch_utils as batch_utils_module + + batch_utils_module.postprocess_agent_loop_outputs = self.original_postprocess + batch_utils_module.compute_response_mask = self.original_compute_response_mask + + def create_mock_rollout_sample(self, sample_id: str, 
param_version: int = 1) -> RolloutSample: + """创建测试用的 RolloutSample""" + # 创建 mock AgentLoopOutput + agent_loop_output = MockAgentLoopOutput( + prompt_ids=[151644, 8948, 198] + list(range(100)), # 简化的prompt_ids + response_ids=[14374, 14822] + list(range(50)), # 简化的response_ids + response_mask=[1] * 52, # response_mask + num_turns=1, + metrics={"generate_time": 0.5}, + ) + + # 创建原始batch信息 + original_batch_dict = { + "batch": {}, # 空的tensor batch用于测试 + "non_tensor_batch": { + "data_source": np.array(["openai/gsm8k"], dtype=object), + "ability": np.array(["math"], dtype=object), + "reward_model": np.array([{"ground_truth": "6", "style": "rule"}], dtype=object), + "extra_info": np.array( + [{"answer": "test answer", "index": 4570, "question": "test question", "split": "train"}], + dtype=object, + ), + "raw_prompt_ids": np.array([[151644, 8948, 198]], dtype=object), + "raw_prompt": np.array([[{"content": "test content", "role": "user"}]], dtype=object), + "tools_kwargs": np.array([{}], dtype=object), + "interaction_kwargs": np.array([{}], dtype=object), + "index": np.array([4570], dtype=object), + }, + "meta_info": {"global_steps": 1}, + } + + return RolloutSample( + original_batch_dict=original_batch_dict, + agent_loop_output=agent_loop_output, + sample_id=sample_id, + epoch=0, + rollout_n_index=0, + original_sample_index=0, + processing_time=0.5, + generation_timestamp=time.time(), + param_version=param_version, + _gen_data=None, + ) + + def test_assemble_batch_empty_input(self): + """测试空输入的情况""" + with self.assertRaises(ValueError) as context: + assemble_batch_from_rollout_samples([], self.tokenizer, self.config) + + self.assertIn("Empty rollout_samples", str(context.exception)) + + def test_assemble_batch_single_sample(self): + """测试单个样本的批次组装""" + # 设置mock返回值 + mock_gen_batch = DataProto( + batch=torch.nn.utils.rnn.pad_sequence( + [ + torch.tensor([151644, 8948, 198] + list(range(100))), + ], + batch_first=True, + padding_value=0, + ), + 
non_tensor_batch={"__test_key": np.array(["test_value"], dtype=object)}, + meta_info={"test_meta": "test_value"}, + ) + self.mock_postprocess.return_value = mock_gen_batch + + # 创建测试样本 + rollout_samples = [self.create_mock_rollout_sample("sample_1")] + + # 调用函数 + result = assemble_batch_from_rollout_samples( + rollout_samples=rollout_samples, tokenizer=self.tokenizer, config=self.config + ) + + # 验证结果 + self.assertIsInstance(result, DataProto) + self.assertIn("uid", result.non_tensor_batch) + self.assertEqual(result.non_tensor_batch["uid"][0], "uid_sample_1") + + # 验证meta_info包含预期字段 + expected_fields = [ + "rollout_param_versions", + "sample_timestamps", + "avg_processing_time", + "max_processing_time", + "param_version_diversity", + "avg_sample_age", + "assembly_time", + ] + for field in expected_fields: + self.assertIn(field, result.meta_info) + + # 验证统计信息 + self.assertEqual(result.meta_info["rollout_param_versions"], [1]) + self.assertEqual(result.meta_info["avg_processing_time"], 0.5) + self.assertEqual(result.meta_info["param_version_diversity"], 1) + + def test_assemble_batch_multiple_samples(self): + """测试多个样本的批次组装""" + # 设置mock返回值 + mock_gen_batch = DataProto( + batch=torch.nn.utils.rnn.pad_sequence( + [ + torch.tensor([151644, 8948, 198] + list(range(100))), + torch.tensor([151644, 8948, 198] + list(range(90))), + ], + batch_first=True, + padding_value=0, + ), + non_tensor_batch={"__test_key": np.array(["test_value1", "test_value2"], dtype=object)}, + meta_info={"test_meta": "test_value"}, + ) + self.mock_postprocess.return_value = mock_gen_batch + + # 创建测试样本 + rollout_samples = [ + self.create_mock_rollout_sample("sample_1", param_version=1), + self.create_mock_rollout_sample("sample_2", param_version=2), + ] + + # 调用函数 + result = assemble_batch_from_rollout_samples( + rollout_samples=rollout_samples, tokenizer=self.tokenizer, config=self.config + ) + + # 验证结果 + self.assertIsInstance(result, DataProto) + 
self.assertEqual(len(result.non_tensor_batch["uid"]), 2) + self.assertListEqual(list(result.non_tensor_batch["uid"]), ["uid_sample_1", "uid_sample_2"]) + + # 验证多样本统计 + self.assertEqual(result.meta_info["rollout_param_versions"], [1, 2]) + self.assertEqual(result.meta_info["param_version_diversity"], 2) # 两个不同版本 + self.assertEqual(result.meta_info["avg_processing_time"], 0.5) + + def test_assemble_batch_with_balance_batch_flag(self): + """测试启用balance_batch标志的情况""" + # 设置mock返回值 + mock_gen_batch = DataProto( + batch=torch.nn.utils.rnn.pad_sequence( + [ + torch.tensor([151644, 8948, 198] + list(range(100))), + ], + batch_first=True, + padding_value=0, + ), + non_tensor_batch={"__test_key": np.array(["test_value"], dtype=object)}, + meta_info={"test_meta": "test_value"}, + ) + self.mock_postprocess.return_value = mock_gen_batch + + # 设置config启用balance_batch + self.config.trainer.balance_batch = True + + # 创建测试样本 + rollout_samples = [self.create_mock_rollout_sample("sample_1")] + + # 调用函数 + result = assemble_batch_from_rollout_samples( + rollout_samples=rollout_samples, tokenizer=self.tokenizer, config=self.config, balance_batch=True + ) + + # 验证结果(主要验证没有抛出异常) + self.assertIsInstance(result, DataProto) + + def test_assemble_batch_attention_mask_processing(self): + """测试attention_mask处理逻辑""" + # 设置mock返回值,包含attention_mask + mock_gen_batch = DataProto( + batch={ + "attention_mask": torch.ones(2, 128, dtype=torch.int64), + "input_ids": torch.randint(0, 1000, (2, 128)), + }, + non_tensor_batch={"__test_key": np.array(["test_value1", "test_value2"], dtype=object)}, + meta_info={"test_meta": "test_value"}, + ) + self.mock_postprocess.return_value = mock_gen_batch + + # 创建测试样本 + rollout_samples = [ + self.create_mock_rollout_sample("sample_1"), + self.create_mock_rollout_sample("sample_2"), + ] + + # 调用函数 + result = assemble_batch_from_rollout_samples( + rollout_samples=rollout_samples, tokenizer=self.tokenizer, config=self.config + ) + + # 验证global_token_num被正确计算 + 
self.assertIn("global_token_num", result.meta_info) + self.assertIsInstance(result.meta_info["global_token_num"], list) + + def test_mock_postprocess_called_correctly(self): + """测试postprocess_agent_loop_outputs被正确调用""" + # 设置mock返回值 + mock_gen_batch = DataProto( + batch=torch.nn.utils.rnn.pad_sequence( + [ + torch.tensor([151644, 8948, 198] + list(range(100))), + ], + batch_first=True, + padding_value=0, + ), + non_tensor_batch={"__test_key": np.array(["test_value"], dtype=object)}, + meta_info={"test_meta": "test_value"}, + ) + self.mock_postprocess.return_value = mock_gen_batch + + # 创建测试样本 + rollout_samples = [self.create_mock_rollout_sample("sample_1")] + + # 调用函数 + result = assemble_batch_from_rollout_samples( + rollout_samples=rollout_samples, tokenizer=self.tokenizer, config=self.config + ) + + print(result) + + # 验证postprocess_agent_loop_outputs被调用 + self.mock_postprocess.assert_called_once() + call_args = self.mock_postprocess.call_args + + # 验证调用参数 + agent_loop_outputs, tokenizer, config = call_args[0] + self.assertEqual(len(agent_loop_outputs), 1) + self.assertEqual(tokenizer, self.tokenizer) + self.assertEqual(config, self.config) + + +if __name__ == "__main__": + unittest.main() diff --git a/recipe/fully_async_policy/utils.py b/recipe/fully_async_policy/utils.py index d9afa0a9ab1..a2e7d5e6c4c 100644 --- a/recipe/fully_async_policy/utils.py +++ b/recipe/fully_async_policy/utils.py @@ -11,10 +11,34 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+from dataclasses import dataclass +from typing import Any # Calculate the number of samples needed - - def calculate_one_step_size(minimal_bsz, ppo_mini_batch_size): return minimal_bsz * ppo_mini_batch_size + + +@dataclass +class RolloutSample: + """Enhanced rollout sample containing both original batch info and AgentLoopOutput""" + + # Original batch information (preserved from _prepare_generate_batch) + original_batch_dict: dict[str, Any] + + # AgentLoopOutput from generation + agent_loop_output: Any # AgentLoopOutput + + # Metadata + sample_id: str + epoch: int + rollout_n_index: int # Index within the rollout.n repetitions (0, 1, ..., n-1) + original_sample_index: int # Index of the original sample before repetition + + # Processing metadata + processing_time: float + generation_timestamp: float + param_version: int + + _gen_data: Any From 936a6720240e4963e1777452f71acfcad5b28c1f Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Sat, 16 Aug 2025 01:02:07 +0800 Subject: [PATCH 053/182] assemble_batch_from_rollout_samples --- .../{batch_utils.py => detach_utils.py} | 95 ++- .../fully_async_rollouter.py | 62 +- .../fully_async_policy/fully_async_trainer.py | 43 +- .../unittest/test_batch_utils.py | 675 +++++++++++++----- recipe/fully_async_policy/utils.py | 44 -- 5 files changed, 582 insertions(+), 337 deletions(-) rename recipe/fully_async_policy/{batch_utils.py => detach_utils.py} (63%) delete mode 100644 recipe/fully_async_policy/utils.py diff --git a/recipe/fully_async_policy/batch_utils.py b/recipe/fully_async_policy/detach_utils.py similarity index 63% rename from recipe/fully_async_policy/batch_utils.py rename to recipe/fully_async_policy/detach_utils.py index 806dd9e1579..202cfdcf783 100644 --- a/recipe/fully_async_policy/batch_utils.py +++ b/recipe/fully_async_policy/detach_utils.py @@ -11,20 +11,79 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. - import time +from dataclasses import dataclass +from typing import Any import numpy as np import torch -from recipe.fully_async_policy.utils import RolloutSample from verl import DataProto from verl.experimental.agent_loop.agent_loop import postprocess_agent_loop_outputs from verl.trainer.ppo.ray_trainer import compute_response_mask +# Calculate the number of samples needed +def calculate_one_step_size(minimal_bsz, ppo_mini_batch_size): + return minimal_bsz * ppo_mini_batch_size + + +@dataclass +class RolloutSample: + """Enhanced rollout sample containing both original batch info and AgentLoopOutput""" + + # Original batch information + full_batch: Any + + # AgentLoopOutput from generation + agent_loop_output: Any # AgentLoopOutput + + # Metadata + sample_id: str + epoch: int + rollout_n_index: int # Index within the rollout.n repetitions (0, 1, ..., n-1) + original_sample_index: int # Index of the original sample before repetition + + # Processing metadata + processing_time: float + generation_timestamp: float + param_version: int + + +def prepare_single_generation_data(batch_dict, global_steps) -> DataProto: + """ + 类似 ray_trainer._prepare_generate_batch 的逻辑,但针对单个样本 + 分离出用于生成的数据和需要保留的原始数据 + + Returns: + tuple: (original_batch_dict, gen_data_for_single_sample) + """ + + # 创建完整的 DataProto + full_batch = DataProto.from_single_dict(batch_dict) + + # batch : TensorDict { input_ids, attention_mask, position_ids} + # non_tensor_batch: raw_prompt_ids, raw_prompt, + # multi_modal_data, tools_kwargs, interaction_kwargs, index, agent_name, + # data_source, ability, reward_model + # meta_info: {} + + # 定义需要传递给生成服务器的字段 + batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"] + non_tensor_batch_keys_to_pop = ["raw_prompt_ids"] + + full_batch.pop( + batch_keys=batch_keys_to_pop, + non_tensor_batch_keys=non_tensor_batch_keys_to_pop, + ) + # 添加全局步数到生成数据 + 
full_batch.meta_info["global_steps"] = global_steps + + return full_batch + + def assemble_batch_from_rollout_samples( - rollout_samples: list[RolloutSample], tokenizer, config, balance_batch: bool = False + rollout_samples: list[RolloutSample], tokenizer, config, balance_batch=None ) -> DataProto: """ Assemble gen_batch_output from RolloutSample objects @@ -60,17 +119,11 @@ def assemble_batch_from_rollout_samples( # 每个 RolloutSample 都是独立的,直接按顺序重建原始数据 original_batch_list = [] for rs in rollout_samples: - original_batch_dict = rs.original_batch_dict - - # 重建 DataProto - original_batch_item = DataProto.from_single_dict( - { - **{k: v for k, v in original_batch_dict["batch"].items()}, - **{f"__{k}": v for k, v in original_batch_dict["non_tensor_batch"].items()}, - } - ) - original_batch_item.meta_info.update(original_batch_dict["meta_info"]) - original_batch_list.append(original_batch_item) + item = rs.full_batch.to_items()[0] + original_batch_list.append(item) + + print("=" * 300) + print(original_batch_list) # 合并所有原始样本为一个批次 if original_batch_list: @@ -79,6 +132,9 @@ def assemble_batch_from_rollout_samples( # 如果没有原始数据,创建空的 DataProto original_batch = DataProto.from_single_dict({}) + print("=" * 300) + print(original_batch) + # 添加 UID uids = [] for rs in rollout_samples: @@ -87,16 +143,21 @@ def assemble_batch_from_rollout_samples( # 直接合并原始数据和生成结果,不需要 repeat # 因为队列中的每个 RolloutSample 都已经是独立的样本 - final_batch = original_batch.union(gen_batch_output) + if original_batch.batch is None: + final_batch = gen_batch_output + # 将 original_batch 的 non_tensor_batch 和 meta_info 合并到 final_batch + for key, value in original_batch.non_tensor_batch.items(): + final_batch.non_tensor_batch[key] = value + final_batch.meta_info.update(original_batch.meta_info) # 计算 response_mask(如果不存在) if "response_mask" not in final_batch.batch.keys(): final_batch.batch["response_mask"] = compute_response_mask(final_batch) # 简化的批次平衡逻辑(如果需要的话) - if balance_batch and hasattr(config, "trainer") and 
getattr(config.trainer, "balance_batch", False): + if balance_batch: # 注意:这里简化了批次平衡逻辑,如果需要完整功能需要额外参数 - print("[BatchUtils] Batch balancing requested but simplified in static function") + balance_batch(final_batch, metrics={}) # 计算全局有效 token 数 if "attention_mask" in final_batch.batch: diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index f166582ef73..888068b12b6 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -18,8 +18,12 @@ import ray from omegaconf import OmegaConf +from recipe.fully_async_policy.detach_utils import ( + RolloutSample, + calculate_one_step_size, + prepare_single_generation_data, +) from recipe.fully_async_policy.message_queue import MessageQueueClient -from recipe.fully_async_policy.utils import RolloutSample, calculate_one_step_size from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup from verl.trainer.ppo.ray_trainer import RayPPOTrainer, ResourcePoolManager, Role, WorkerType from verl.utils.tracking import ValidationGenerationsLogger @@ -213,55 +217,6 @@ def _init_async_rollout_manager(self): worker_group=self.rollout_wg, ) - def _prepare_single_generation_data(self, batch_dict): - """ - 类似 ray_trainer._prepare_generate_batch 的逻辑,但针对单个样本 - 分离出用于生成的数据和需要保留的原始数据 - - Returns: - tuple: (original_batch_dict, gen_data_for_single_sample) - """ - from verl import DataProto - - # 创建完整的 DataProto - full_batch = DataProto.from_single_dict(batch_dict) - - # 定义需要传递给生成服务器的字段 - batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"] - non_tensor_batch_keys_to_pop = ["raw_prompt_ids"] - - # 处理可选字段 - optional_fields = [ - "multi_modal_data", - "raw_prompt", - "tools_kwargs", - "interaction_kwargs", - "index", - "agent_name", - ] - - for field in optional_fields: - if field in full_batch.non_tensor_batch: - non_tensor_batch_keys_to_pop.append(field) - - # 分离数据:gen_batch 
用于生成,original_batch 保留原始信息 - gen_batch = full_batch.pop( - batch_keys=batch_keys_to_pop, - non_tensor_batch_keys=non_tensor_batch_keys_to_pop, - ) - - # 添加全局步数到生成数据 - gen_batch.meta_info["global_steps"] = self.global_steps - - # 保留原始 batch 信息(转换为字典格式以便序列化) - original_batch_dict = { - "batch": {k: v.clone() if hasattr(v, "clone") else v for k, v in full_batch.batch.items()}, - "non_tensor_batch": dict(full_batch.non_tensor_batch), - "meta_info": dict(full_batch.meta_info), - } - - return original_batch_dict, gen_batch - # 添加样本到待处理队列的协程 async def _feed_samples(self): continuous_iterator = self._create_continuous_iterator() @@ -273,7 +228,7 @@ async def _feed_samples(self): break # 类似 _prepare_generate_batch 的逻辑:分离数据 - original_batch, gen_data = self._prepare_single_generation_data(batch_dict) + full_batch = prepare_single_generation_data(batch_dict, self.global_steps) # 根据 rollout.n 进行重复 for rollout_n_index in range(self.config.actor_rollout_ref.rollout.n): @@ -281,7 +236,7 @@ async def _feed_samples(self): # 创建部分 RolloutSample,不包含 _gen_data(因为它不在数据类定义中) partial_rollout_sample = RolloutSample( - original_batch_dict=original_batch, + full_batch=full_batch, agent_loop_output=None, # 待处理后填充 sample_id=sample_id, epoch=epoch, @@ -290,7 +245,6 @@ async def _feed_samples(self): processing_time=0.0, # 待处理后填充 generation_timestamp=0.0, # 待处理后填充 param_version=0, # 待处理后填充 - _gen_data=gen_data, ) await self.pending_queue.put(partial_rollout_sample) @@ -362,7 +316,7 @@ async def _process_single_sample_streaming(self, partial_rollout_sample): # 调用异步生成方法 agent_loop_output, processing_time = await self.async_rollout_manager.generate_single_sample_async( - partial_rollout_sample._gen_data, partial_rollout_sample.sample_id + partial_rollout_sample.full_batch, partial_rollout_sample.sample_id ) # 直接更新 RolloutSample 对象,填充剩余字段 partial_rollout_sample.agent_loop_output = agent_loop_output diff --git a/recipe/fully_async_policy/fully_async_trainer.py 
b/recipe/fully_async_policy/fully_async_trainer.py index cbea37c4083..c9a495c60ed 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -21,8 +21,12 @@ import ray from omegaconf import OmegaConf +from recipe.fully_async_policy.detach_utils import ( + RolloutSample, + assemble_batch_from_rollout_samples, + calculate_one_step_size, +) from recipe.fully_async_policy.message_queue import MessageQueueClient -from recipe.fully_async_policy.utils import RolloutSample, calculate_one_step_size from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup from verl.trainer.ppo import core_algos from verl.trainer.ppo.core_algos import AdvantageEstimator @@ -167,39 +171,12 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]: queue_samples = [ray.cloudpickle.loads(x) for x in queue_samples] print(queue_samples) # Assemble batch - now working directly with RolloutSample objects - batch = self._assemble_gen_batch_output_from_queue_samples(queue_samples) - print(f" _assemble_gen_batch_output_from_queue_samples {batch}") - return 0, queue_samples - # - # return 0, batch - - def _assemble_gen_batch_output_from_queue_samples(self, rollout_samples: list[RolloutSample]): - """ - Assemble gen_batch_output from RolloutSample objects - 从 RolloutSample 对象中组装批次,类似 ray_trainer 的 _post_generate_batch 逻辑 - - Args: - rollout_samples: List of RolloutSample objects - - Returns: - DataProto: Assembled gen_batch_output - """ - from recipe.fully_async_policy.batch_utils import assemble_batch_from_rollout_samples - - # 使用静态函数进行批次组装 - final_batch = assemble_batch_from_rollout_samples( - rollout_samples=rollout_samples, - tokenizer=self.tokenizer, - config=self.config, - balance_batch=False, # 不使用静态函数的简化版本 - ) - - # 如果需要完整的批次平衡,在这里调用 if self.config.trainer.balance_batch: - self._balance_batch(final_batch, metrics={}) - - print(f"[FullyAsyncTrainer] {final_batch}") - return final_batch + batch = 
assemble_batch_from_rollout_samples(queue_samples, self._balance_batch) + else: + batch = assemble_batch_from_rollout_samples(queue_samples) + print(f" _assemble_gen_batch_output_from_queue_samples {batch}") + return 0, batch def _create_actor_rollout_classes(self): # create actor diff --git a/recipe/fully_async_policy/unittest/test_batch_utils.py b/recipe/fully_async_policy/unittest/test_batch_utils.py index c2593f83ec7..ddde3a4ad92 100644 --- a/recipe/fully_async_policy/unittest/test_batch_utils.py +++ b/recipe/fully_async_policy/unittest/test_batch_utils.py @@ -23,14 +23,22 @@ import numpy as np import torch +from tensordict import TensorDict sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..", "..")) -from recipe.fully_async_policy.batch_utils import assemble_batch_from_rollout_samples -from recipe.fully_async_policy.message_queue import RolloutSample +from recipe.fully_async_policy.detach_utils import RolloutSample, assemble_batch_from_rollout_samples from verl import DataProto +@dataclass +class MockAgentLoopMetrics: + """Mock AgentLoopMetrics for testing""" + + generate_sequences: float = 0.5 + tool_calls: float = 0.0 + + @dataclass class MockAgentLoopOutput: """Mock AgentLoopOutput for testing""" @@ -39,11 +47,11 @@ class MockAgentLoopOutput: response_ids: list[int] response_mask: list[int] num_turns: int = 1 - metrics: dict = None + metrics: MockAgentLoopMetrics = None def __post_init__(self): if self.metrics is None: - self.metrics = {} + self.metrics = MockAgentLoopMetrics() class MockConfig: @@ -70,131 +78,406 @@ def setUp(self): self.mock_postprocess = MagicMock() # Patch the postprocess function - import recipe.fully_async_policy.batch_utils as batch_utils_module + import recipe.fully_async_policy.detach_utils as detach_utils_module - self.original_postprocess = batch_utils_module.postprocess_agent_loop_outputs - batch_utils_module.postprocess_agent_loop_outputs = self.mock_postprocess + self.original_postprocess = 
detach_utils_module.postprocess_agent_loop_outputs + detach_utils_module.postprocess_agent_loop_outputs = self.mock_postprocess # Mock compute_response_mask function - self.original_compute_response_mask = batch_utils_module.compute_response_mask - batch_utils_module.compute_response_mask = MagicMock(return_value=torch.ones(2, 128, dtype=torch.int64)) + self.original_compute_response_mask = detach_utils_module.compute_response_mask + detach_utils_module.compute_response_mask = MagicMock(return_value=torch.ones(2, 128, dtype=torch.int64)) def tearDown(self): """清理测试环境""" - import recipe.fully_async_policy.batch_utils as batch_utils_module + import recipe.fully_async_policy.detach_utils as detach_utils_module - batch_utils_module.postprocess_agent_loop_outputs = self.original_postprocess - batch_utils_module.compute_response_mask = self.original_compute_response_mask + detach_utils_module.postprocess_agent_loop_outputs = self.original_postprocess + detach_utils_module.compute_response_mask = self.original_compute_response_mask def create_mock_rollout_sample(self, sample_id: str, param_version: int = 1) -> RolloutSample: """创建测试用的 RolloutSample""" # 创建 mock AgentLoopOutput agent_loop_output = MockAgentLoopOutput( - prompt_ids=[151644, 8948, 198] + list(range(100)), # 简化的prompt_ids - response_ids=[14374, 14822] + list(range(50)), # 简化的response_ids - response_mask=[1] * 52, # response_mask - num_turns=1, - metrics={"generate_time": 0.5}, + prompt_ids=[ + 151644, + 8948, + 198, + 2610, + 525, + 1207, + 16948, + 11, + 3465, + 553, + 54364, + 14817, + 13, + 1446, + 525, + 264, + 10950, + 17847, + 13, + 151645, + 198, + 151644, + 872, + 198, + 24732, + 21189, + 264, + 400, + 16, + 17, + 40358, + 817, + 2254, + 13, + 758, + 279, + 1156, + 2003, + 11, + 566, + 37102, + 264, + 4843, + 315, + 432, + 26, + 304, + 279, + 2086, + 2003, + 11, + 566, + 37102, + 264, + 8338, + 315, + 1128, + 566, + 702, + 2115, + 13, + 2585, + 1753, + 3220, + 1558, + 566, + 614, + 2115, + 311, + 
6248, + 279, + 2254, + 30, + 6771, + 594, + 1744, + 3019, + 553, + 3019, + 323, + 2550, + 279, + 1590, + 4226, + 1283, + 330, + 820, + 3263, + 151645, + 198, + 151644, + 77091, + 198, + ], + response_ids=[ + 14374, + 14822, + 14319, + 12, + 8304, + 74216, + 510, + 16, + 13, + 4127, + 40358, + 25, + 400, + 16, + 17, + 198, + 17, + 13, + 5512, + 2003, + 18024, + 510, + 262, + 481, + 8364, + 37102, + 264, + 4843, + 315, + 279, + 400, + 16, + 17, + 624, + 262, + 481, + 25783, + 7391, + 284, + 57960, + 37018, + 90, + 16, + 15170, + 18, + 92, + 1124, + 15136, + 32882, + 16, + 17, + 284, + 32882, + 19, + 66426, + 18, + 13, + 10657, + 3311, + 1283, + 1156, + 2003, + 25, + 400, + 16, + 17, + 481, + 32882, + 19, + 284, + 32882, + 23, + 66426, + 19, + 13, + 10440, + 2003, + 18024, + 510, + 262, + 481, + 8364, + 37102, + 264, + 8338, + 315, + 279, + 9664, + 3311, + 1283, + 279, + 1156, + 2003, + 624, + 262, + 481, + 11487, + 2115, + 284, + 400, + 23, + 481, + 400, + 19, + 284, + 400, + 19, + 198, + 262, + 481, + 25783, + 7391, + 2049, + 57960, + 37018, + 90, + 16, + 15170, + 19, + 92, + 1124, + 15136, + 32882, + 19, + 284, + 32882, + 16, + 66426, + 20, + 13, + 13023, + 3311, + 2115, + 510, + 262, + 481, + 8364, + 702, + 3322, + 369, + 264, + 2480, + 2003, + 311, + 6248, + 279, + 2254, + 2041, + 32821, + 894, + 803, + 40358, + 382, + 43434, + 510, + 24732, + 702, + 3070, + 65039, + 23, + 334, + 2115, + 13, + 1260, + 686, + 614, + 3322, + 3220, + 311, + 6248, + 279, + 2254, + 2041, + 32821, + 894, + 803, + 40358, + 13, + 151645, + ], + response_mask=[1] * 175, # 真实的response长度 + num_turns=2, + metrics=MockAgentLoopMetrics(generate_sequences=1.6468379497528076, tool_calls=0.0), ) - # 创建原始batch信息 - original_batch_dict = { - "batch": {}, # 空的tensor batch用于测试 - "non_tensor_batch": { - "data_source": np.array(["openai/gsm8k"], dtype=object), - "ability": np.array(["math"], dtype=object), - "reward_model": np.array([{"ground_truth": "6", "style": "rule"}], dtype=object), - 
"extra_info": np.array( - [{"answer": "test answer", "index": 4570, "question": "test question", "split": "train"}], + # 创建mock _gen_data + mock_gen_data = DataProto( + non_tensor_batch={ + "raw_prompt": np.array( + [ + [ + { + "content": "Tom receives a $12 allowance per month.", + "role": "user", + } + ] + ], dtype=object, ), - "raw_prompt_ids": np.array([[151644, 8948, 198]], dtype=object), - "raw_prompt": np.array([[{"content": "test content", "role": "user"}]], dtype=object), "tools_kwargs": np.array([{}], dtype=object), "interaction_kwargs": np.array([{}], dtype=object), "index": np.array([4570], dtype=object), }, - "meta_info": {"global_steps": 1}, - } + meta_info={"global_steps": 1}, + ) return RolloutSample( - original_batch_dict=original_batch_dict, + full_batch=mock_gen_data, agent_loop_output=agent_loop_output, sample_id=sample_id, epoch=0, rollout_n_index=0, original_sample_index=0, - processing_time=0.5, + processing_time=1.6468379497528076, generation_timestamp=time.time(), param_version=param_version, - _gen_data=None, ) - def test_assemble_batch_empty_input(self): - """测试空输入的情况""" - with self.assertRaises(ValueError) as context: - assemble_batch_from_rollout_samples([], self.tokenizer, self.config) - - self.assertIn("Empty rollout_samples", str(context.exception)) - - def test_assemble_batch_single_sample(self): - """测试单个样本的批次组装""" - # 设置mock返回值 - mock_gen_batch = DataProto( - batch=torch.nn.utils.rnn.pad_sequence( - [ - torch.tensor([151644, 8948, 198] + list(range(100))), - ], - batch_first=True, - padding_value=0, - ), - non_tensor_batch={"__test_key": np.array(["test_value"], dtype=object)}, - meta_info={"test_meta": "test_value"}, - ) - self.mock_postprocess.return_value = mock_gen_batch - - # 创建测试样本 - rollout_samples = [self.create_mock_rollout_sample("sample_1")] - - # 调用函数 - result = assemble_batch_from_rollout_samples( - rollout_samples=rollout_samples, tokenizer=self.tokenizer, config=self.config - ) - - # 验证结果 - 
self.assertIsInstance(result, DataProto) - self.assertIn("uid", result.non_tensor_batch) - self.assertEqual(result.non_tensor_batch["uid"][0], "uid_sample_1") - - # 验证meta_info包含预期字段 - expected_fields = [ - "rollout_param_versions", - "sample_timestamps", - "avg_processing_time", - "max_processing_time", - "param_version_diversity", - "avg_sample_age", - "assembly_time", - ] - for field in expected_fields: - self.assertIn(field, result.meta_info) - - # 验证统计信息 - self.assertEqual(result.meta_info["rollout_param_versions"], [1]) - self.assertEqual(result.meta_info["avg_processing_time"], 0.5) - self.assertEqual(result.meta_info["param_version_diversity"], 1) + # def test_assemble_batch_empty_input(self): + # """测试空输入的情况""" + # with self.assertRaises(ValueError) as context: + # assemble_batch_from_rollout_samples([], self.tokenizer, self.config) + # + # self.assertIn("Empty rollout_samples", str(context.exception)) + # + # def test_assemble_batch_single_sample(self): + # """测试单个样本的批次组装""" + # # 设置mock返回值 - 使用正确的TensorDict格式 + # mock_gen_batch = DataProto( + # batch=TensorDict({ + # "input_ids": torch.randint(0, 1000, (1, 256)), + # "attention_mask": torch.ones(1, 256, dtype=torch.int64), + # "position_ids": torch.arange(256).unsqueeze(0), + # "prompts": torch.randint(0, 1000, (1, 128)), + # "responses": torch.randint(0, 1000, (1, 128)), + # "response_mask": torch.ones(1, 128, dtype=torch.int64), + # }, batch_size=1), + # non_tensor_batch={"__test_key": np.array(["test_value"], dtype=object)}, + # meta_info={"test_meta": "test_value"} + # ) + # self.mock_postprocess.return_value = mock_gen_batch + # + # # 创建测试样本 + # rollout_samples = [self.create_mock_rollout_sample("sample_1")] + # + # # 调用函数 + # result = assemble_batch_from_rollout_samples( + # rollout_samples=rollout_samples, + # tokenizer=self.tokenizer, + # config=self.config + # ) + # + # # 验证结果 + # self.assertIsInstance(result, DataProto) + # self.assertIn("uid", result.non_tensor_batch) + # 
self.assertEqual(result.non_tensor_batch["uid"][0], "uid_sample_1") + # + # # 验证meta_info包含预期字段 + # expected_fields = [ + # "rollout_param_versions", "sample_timestamps", "avg_processing_time", + # "max_processing_time", "param_version_diversity", "avg_sample_age", "assembly_time" + # ] + # for field in expected_fields: + # self.assertIn(field, result.meta_info) + # + # # 验证统计信息 + # self.assertEqual(result.meta_info["rollout_param_versions"], [1]) + # self.assertAlmostEqual(result.meta_info["avg_processing_time"], 1.6468379497528076, places=5) + # self.assertEqual(result.meta_info["param_version_diversity"], 1) def test_assemble_batch_multiple_samples(self): """测试多个样本的批次组装""" - # 设置mock返回值 + # 设置mock返回值 - 使用正确的TensorDict格式 mock_gen_batch = DataProto( - batch=torch.nn.utils.rnn.pad_sequence( - [ - torch.tensor([151644, 8948, 198] + list(range(100))), - torch.tensor([151644, 8948, 198] + list(range(90))), - ], - batch_first=True, - padding_value=0, + batch=TensorDict( + { + "input_ids": torch.randint(0, 1000, (2, 256)), + "attention_mask": torch.ones(2, 256, dtype=torch.int64), + "position_ids": torch.arange(256).unsqueeze(0).repeat(2, 1), + "prompts": torch.randint(0, 1000, (2, 128)), + "responses": torch.randint(0, 1000, (2, 128)), + "response_mask": torch.ones(2, 128, dtype=torch.int64), + }, + batch_size=2, ), non_tensor_batch={"__test_key": np.array(["test_value1", "test_value2"], dtype=object)}, meta_info={"test_meta": "test_value"}, @@ -207,6 +490,8 @@ def test_assemble_batch_multiple_samples(self): self.create_mock_rollout_sample("sample_2", param_version=2), ] + print(rollout_samples) + # 调用函数 result = assemble_batch_from_rollout_samples( rollout_samples=rollout_samples, tokenizer=self.tokenizer, config=self.config @@ -220,101 +505,113 @@ def test_assemble_batch_multiple_samples(self): # 验证多样本统计 self.assertEqual(result.meta_info["rollout_param_versions"], [1, 2]) self.assertEqual(result.meta_info["param_version_diversity"], 2) # 两个不同版本 - 
self.assertEqual(result.meta_info["avg_processing_time"], 0.5) - - def test_assemble_batch_with_balance_batch_flag(self): - """测试启用balance_batch标志的情况""" - # 设置mock返回值 - mock_gen_batch = DataProto( - batch=torch.nn.utils.rnn.pad_sequence( - [ - torch.tensor([151644, 8948, 198] + list(range(100))), - ], - batch_first=True, - padding_value=0, - ), - non_tensor_batch={"__test_key": np.array(["test_value"], dtype=object)}, - meta_info={"test_meta": "test_value"}, - ) - self.mock_postprocess.return_value = mock_gen_batch - - # 设置config启用balance_batch - self.config.trainer.balance_batch = True - - # 创建测试样本 - rollout_samples = [self.create_mock_rollout_sample("sample_1")] - - # 调用函数 - result = assemble_batch_from_rollout_samples( - rollout_samples=rollout_samples, tokenizer=self.tokenizer, config=self.config, balance_batch=True - ) - - # 验证结果(主要验证没有抛出异常) - self.assertIsInstance(result, DataProto) - - def test_assemble_batch_attention_mask_processing(self): - """测试attention_mask处理逻辑""" - # 设置mock返回值,包含attention_mask - mock_gen_batch = DataProto( - batch={ - "attention_mask": torch.ones(2, 128, dtype=torch.int64), - "input_ids": torch.randint(0, 1000, (2, 128)), - }, - non_tensor_batch={"__test_key": np.array(["test_value1", "test_value2"], dtype=object)}, - meta_info={"test_meta": "test_value"}, - ) - self.mock_postprocess.return_value = mock_gen_batch - - # 创建测试样本 - rollout_samples = [ - self.create_mock_rollout_sample("sample_1"), - self.create_mock_rollout_sample("sample_2"), - ] - - # 调用函数 - result = assemble_batch_from_rollout_samples( - rollout_samples=rollout_samples, tokenizer=self.tokenizer, config=self.config - ) - - # 验证global_token_num被正确计算 - self.assertIn("global_token_num", result.meta_info) - self.assertIsInstance(result.meta_info["global_token_num"], list) - - def test_mock_postprocess_called_correctly(self): - """测试postprocess_agent_loop_outputs被正确调用""" - # 设置mock返回值 - mock_gen_batch = DataProto( - batch=torch.nn.utils.rnn.pad_sequence( - [ - 
torch.tensor([151644, 8948, 198] + list(range(100))), - ], - batch_first=True, - padding_value=0, - ), - non_tensor_batch={"__test_key": np.array(["test_value"], dtype=object)}, - meta_info={"test_meta": "test_value"}, - ) - self.mock_postprocess.return_value = mock_gen_batch - - # 创建测试样本 - rollout_samples = [self.create_mock_rollout_sample("sample_1")] - - # 调用函数 - result = assemble_batch_from_rollout_samples( - rollout_samples=rollout_samples, tokenizer=self.tokenizer, config=self.config - ) - - print(result) - - # 验证postprocess_agent_loop_outputs被调用 - self.mock_postprocess.assert_called_once() - call_args = self.mock_postprocess.call_args - - # 验证调用参数 - agent_loop_outputs, tokenizer, config = call_args[0] - self.assertEqual(len(agent_loop_outputs), 1) - self.assertEqual(tokenizer, self.tokenizer) - self.assertEqual(config, self.config) + self.assertAlmostEqual(result.meta_info["avg_processing_time"], 1.6468379497528076, places=5) + + # def test_assemble_batch_with_balance_batch_flag(self): + # """测试启用balance_batch标志的情况""" + # # 设置mock返回值 - 使用正确的TensorDict格式 + # mock_gen_batch = DataProto( + # batch=TensorDict({ + # "input_ids": torch.randint(0, 1000, (1, 256)), + # "attention_mask": torch.ones(1, 256, dtype=torch.int64), + # "position_ids": torch.arange(256).unsqueeze(0), + # "prompts": torch.randint(0, 1000, (1, 128)), + # "responses": torch.randint(0, 1000, (1, 128)), + # "response_mask": torch.ones(1, 128, dtype=torch.int64), + # }, batch_size=1), + # non_tensor_batch={"__test_key": np.array(["test_value"], dtype=object)}, + # meta_info={"test_meta": "test_value"} + # ) + # self.mock_postprocess.return_value = mock_gen_batch + # + # # 设置config启用balance_batch + # self.config.trainer.balance_batch = True + # + # # 创建测试样本 + # rollout_samples = [self.create_mock_rollout_sample("sample_1")] + # + # # 调用函数 + # result = assemble_batch_from_rollout_samples( + # rollout_samples=rollout_samples, + # tokenizer=self.tokenizer, + # config=self.config, + # 
balance_batch=True + # ) + # + # # 验证结果(主要验证没有抛出异常) + # self.assertIsInstance(result, DataProto) + # + # def test_assemble_batch_attention_mask_processing(self): + # """测试attention_mask处理逻辑""" + # # 设置mock返回值 - 使用正确的TensorDict格式 + # mock_gen_batch = DataProto( + # batch=TensorDict({ + # "input_ids": torch.randint(0, 1000, (2, 256)), + # "attention_mask": torch.ones(2, 256, dtype=torch.int64), + # "position_ids": torch.arange(256).unsqueeze(0).repeat(2, 1), + # "prompts": torch.randint(0, 1000, (2, 128)), + # "responses": torch.randint(0, 1000, (2, 128)), + # "response_mask": torch.ones(2, 128, dtype=torch.int64), + # }, batch_size=2), + # non_tensor_batch={"__test_key": np.array(["test_value1", "test_value2"], dtype=object)}, + # meta_info={"test_meta": "test_value"} + # ) + # self.mock_postprocess.return_value = mock_gen_batch + # + # # 创建测试样本 + # rollout_samples = [ + # self.create_mock_rollout_sample("sample_1"), + # self.create_mock_rollout_sample("sample_2"), + # ] + # + # # 调用函数 + # result = assemble_batch_from_rollout_samples( + # rollout_samples=rollout_samples, + # tokenizer=self.tokenizer, + # config=self.config + # ) + # + # # 验证global_token_num被正确计算 + # self.assertIn("global_token_num", result.meta_info) + # self.assertIsInstance(result.meta_info["global_token_num"], list) + # + # def test_mock_postprocess_called_correctly(self): + # """测试postprocess_agent_loop_outputs被正确调用""" + # # 设置mock返回值 - 使用正确的TensorDict格式 + # mock_gen_batch = DataProto( + # batch=TensorDict({ + # "input_ids": torch.randint(0, 1000, (1, 256)), + # "attention_mask": torch.ones(1, 256, dtype=torch.int64), + # "position_ids": torch.arange(256).unsqueeze(0), + # "prompts": torch.randint(0, 1000, (1, 128)), + # "responses": torch.randint(0, 1000, (1, 128)), + # "response_mask": torch.ones(1, 128, dtype=torch.int64), + # }, batch_size=1), + # non_tensor_batch={"__test_key": np.array(["test_value"], dtype=object)}, + # meta_info={"test_meta": "test_value"} + # ) + # 
self.mock_postprocess.return_value = mock_gen_batch + # + # # 创建测试样本 + # rollout_samples = [self.create_mock_rollout_sample("sample_1")] + # + # # 调用函数 + # result = assemble_batch_from_rollout_samples( + # rollout_samples=rollout_samples, + # tokenizer=self.tokenizer, + # config=self.config + # ) + # + # # 验证postprocess_agent_loop_outputs被调用 + # self.mock_postprocess.assert_called_once() + # call_args = self.mock_postprocess.call_args + # + # # 验证调用参数 + # agent_loop_outputs, tokenizer, config = call_args[0] + # self.assertEqual(len(agent_loop_outputs), 1) + # self.assertEqual(tokenizer, self.tokenizer) + # self.assertEqual(config, self.config) + # if __name__ == "__main__": diff --git a/recipe/fully_async_policy/utils.py b/recipe/fully_async_policy/utils.py deleted file mode 100644 index a2e7d5e6c4c..00000000000 --- a/recipe/fully_async_policy/utils.py +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright 2025 Meituan Ltd. and/or its affiliates -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from dataclasses import dataclass -from typing import Any - - -# Calculate the number of samples needed -def calculate_one_step_size(minimal_bsz, ppo_mini_batch_size): - return minimal_bsz * ppo_mini_batch_size - - -@dataclass -class RolloutSample: - """Enhanced rollout sample containing both original batch info and AgentLoopOutput""" - - # Original batch information (preserved from _prepare_generate_batch) - original_batch_dict: dict[str, Any] - - # AgentLoopOutput from generation - agent_loop_output: Any # AgentLoopOutput - - # Metadata - sample_id: str - epoch: int - rollout_n_index: int # Index within the rollout.n repetitions (0, 1, ..., n-1) - original_sample_index: int # Index of the original sample before repetition - - # Processing metadata - processing_time: float - generation_timestamp: float - param_version: int - - _gen_data: Any From 7763c689122f3b9b2df9968f7f2f8053ccf4200e Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Sat, 16 Aug 2025 01:40:27 +0800 Subject: [PATCH 054/182] train success --- .../fully_async_policy/fully_async_trainer.py | 90 +++++++++++-------- 1 file changed, 52 insertions(+), 38 deletions(-) diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index c9a495c60ed..07694498378 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -172,9 +172,9 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]: print(queue_samples) # Assemble batch - now working directly with RolloutSample objects if self.config.trainer.balance_batch: - batch = assemble_batch_from_rollout_samples(queue_samples, self._balance_batch) + batch = assemble_batch_from_rollout_samples(queue_samples, self.tokenizer, self.config, self._balance_batch) else: - batch = assemble_batch_from_rollout_samples(queue_samples) + batch = assemble_batch_from_rollout_samples(queue_samples, self.tokenizer, self.config, None) print(f" 
_assemble_gen_batch_output_from_queue_samples {batch}") return 0, batch @@ -243,49 +243,63 @@ def fit(self): # Use queue mode, no need for traditional dataloader iterator # Initialize to get the first batch of data while True: - # metrics = {} + metrics = {} timing_raw = {} - # is_last_step = False - with marked_timer("step", timing_raw): with marked_timer("gen", timing_raw, color="red"): epoch, batch = self._get_samples_from_queue() if batch is None: break - # - # # 更新统计信息 - # self.processed_samples += len(batch) if isinstance(batch, list) else 1 - # - # # 从meta_info中获取参数版本信息 - # if hasattr(batch, "meta_info") and batch.meta_info: - # rollout_param_versions = batch.meta_info.get("rollout_param_versions", []) - # if rollout_param_versions: - # # 统计陈旧样本 - # stale_count = sum(1 for v in rollout_param_versions if self.current_param_version - v > 1) - # self.stale_samples_processed += stale_count - # - # # 添加新鲜度指标到metrics - # if rollout_param_versions: - # param_version_diversity = batch.meta_info.get("param_version_diversity", 0) - # avg_sample_age = batch.meta_info.get("avg_sample_age", 0) - # - # metrics.update( - # { - # "freshness/param_version_diversity": param_version_diversity, - # "freshness/avg_sample_age": avg_sample_age, - # "freshness/stale_samples_ratio": stale_count / len(rollout_param_versions) - # if rollout_param_versions - # else 0, - # "statistics/processed_samples": self.processed_samples, - # "statistics/stale_samples_processed": self.stale_samples_processed, - # "statistics/current_param_version": self.current_param_version, - # } - # ) - # batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw) - # self._log_rollout(batch, reward_extra_infos_dict, timing_raw) - # self._check_save_checkpoint(is_last_step, timing_raw) - # + + # 更新统计信息 + self.processed_samples += len(batch) if isinstance(batch, list) else 1 + + # 从meta_info中获取参数版本信息 + if hasattr(batch, "meta_info") and batch.meta_info: + # meta_info={'metrics': 
[{'generate_sequences': 1.8240885734558105, 'tool_calls': 0.0}, + # {'generate_sequences': 2.5197629928588867, 'tool_calls': 0.0}, + # {'generate_sequences': 3.5084900856018066, 'tool_calls': 0.0}, + # {'generate_sequences': 2.4329097270965576, 'tool_calls': 0.0}, + # {'generate_sequences': 3.0567924976348877, 'tool_calls': 0.0}, + # {'generate_sequences': 4.271160840988159, 'tool_calls': 0.0}], + # 'global_steps': 22, + # 'global_token_num': [588, 517, 422, 406, 355, 288], + # 'rollout_param_versions': [0, 0, 0, 0, 0, 0], + # 'sample_timestamps': [1755278023.7771623, 1755278024.101492, 1755278024.3597627, + # 1755278024.4885263, 1755278025.1039019, 1755278025.555585], + # 'avg_processing_time': 2.935534119606018, + # 'max_processing_time': 4.271160840988159, + # 'param_version_diversity': 1, + # 'avg_sample_age': 1.0503787994384766, + # 'assembly_time': 0.05373978614807129}) + rollout_param_versions = batch.meta_info.get("rollout_param_versions", []) + if rollout_param_versions: + # 统计陈旧样本 + stale_count = sum(1 for v in rollout_param_versions if self.current_param_version - v > 1) + self.stale_samples_processed += stale_count + + # 添加新鲜度指标到metrics + if rollout_param_versions: + param_version_diversity = batch.meta_info.get("param_version_diversity", 0) + avg_sample_age = batch.meta_info.get("avg_sample_age", 0) + + metrics.update( + { + "freshness/param_version_diversity": param_version_diversity, + "freshness/avg_sample_age": avg_sample_age, + "freshness/stale_samples_ratio": stale_count / len(rollout_param_versions) + if rollout_param_versions + else 0, + "statistics/processed_samples": self.processed_samples, + "statistics/stale_samples_processed": self.stale_samples_processed, + "statistics/current_param_version": self.current_param_version, + } + ) + batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw) + self._log_rollout(batch, reward_extra_infos_dict, timing_raw) + self._check_save_checkpoint(False, timing_raw) + # 
self._collect_metrics(batch, epoch, metrics, timing_raw) # Trigger parameter synchronization after training step From d8212d9d7f4ca167e6923a4c27674b38e9fc3096 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Mon, 18 Aug 2025 10:45:30 +0800 Subject: [PATCH 055/182] refactor log --- recipe/fully_async_policy/detach_utils.py | 10 ++++------ recipe/fully_async_policy/fully_async_rollouter.py | 4 ++++ recipe/fully_async_policy/fully_async_trainer.py | 4 ++-- tests/special_e2e/run_fully_async_policy.sh | 6 +++--- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py index 202cfdcf783..a76f42d7362 100644 --- a/recipe/fully_async_policy/detach_utils.py +++ b/recipe/fully_async_policy/detach_utils.py @@ -122,8 +122,8 @@ def assemble_batch_from_rollout_samples( item = rs.full_batch.to_items()[0] original_batch_list.append(item) - print("=" * 300) - print(original_batch_list) + # print("=" * 300) + # print(original_batch_list) # 合并所有原始样本为一个批次 if original_batch_list: @@ -132,8 +132,8 @@ def assemble_batch_from_rollout_samples( # 如果没有原始数据,创建空的 DataProto original_batch = DataProto.from_single_dict({}) - print("=" * 300) - print(original_batch) + # print("=" * 300) + # print(original_batch) # 添加 UID uids = [] @@ -154,9 +154,7 @@ def assemble_batch_from_rollout_samples( if "response_mask" not in final_batch.batch.keys(): final_batch.batch["response_mask"] = compute_response_mask(final_batch) - # 简化的批次平衡逻辑(如果需要的话) if balance_batch: - # 注意:这里简化了批次平衡逻辑,如果需要完整功能需要额外参数 balance_batch(final_batch, metrics={}) # 计算全局有效 token 数 diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 888068b12b6..939f7a45b93 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -134,6 +134,10 @@ def __init__( self.minimal_bsz, config.actor_rollout_ref.actor.ppo_mini_batch_size ) 
self.max_required_samples = self.required_samples * (self.staleness_threshold + 1) + print( + f"[FullyAsyncRollouter] required_samples : {self.required_samples} " + f"max_required_samples: {self.max_required_samples}" + ) # 单次最多扔一次迭代需要的样本 self.max_concurrent_samples = self.required_samples diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 07694498378..ce8735bd8cc 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -169,13 +169,13 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]: ) queue_samples = [ray.cloudpickle.loads(x) for x in queue_samples] - print(queue_samples) + # print(queue_samples) # Assemble batch - now working directly with RolloutSample objects if self.config.trainer.balance_batch: batch = assemble_batch_from_rollout_samples(queue_samples, self.tokenizer, self.config, self._balance_batch) else: batch = assemble_batch_from_rollout_samples(queue_samples, self.tokenizer, self.config, None) - print(f" _assemble_gen_batch_output_from_queue_samples {batch}") + # print(f" _assemble_gen_batch_output_from_queue_samples {batch}") return 0, batch def _create_actor_rollout_classes(self): diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh index a938499a86b..c48f7b7507c 100644 --- a/tests/special_e2e/run_fully_async_policy.sh +++ b/tests/special_e2e/run_fully_async_policy.sh @@ -44,9 +44,9 @@ loss_agg_mode="token-mean" train_prompt_bsz=0 gen_prompt_bsz=1 n_resp_per_prompt=3 -train_prompt_mini_bsz=1 +train_prompt_mini_bsz=32 -total_rollout_steps=50 +total_rollout_steps=5000 # Temperature parameters temperature=1.0 @@ -60,7 +60,7 @@ n_gpus_rollout=2 n_gpus_training=$((NUM_GPUS - n_gpus_rollout)) # Async training specific configurations -staleness_threshold=3 +staleness_threshold=30000 exp_name="$(basename 
"${MODEL_ID,,}")-fully-async-policy-${ACTOR_STRATEGY}-minimal" From 25740b20d116bdfcab825a6045c104bbb9d84f0e Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Mon, 18 Aug 2025 12:55:35 +0800 Subject: [PATCH 056/182] stop system run --- .../fully_async_rollouter.py | 37 ++++++++++------ .../fully_async_policy/fully_async_trainer.py | 4 +- recipe/fully_async_policy/message_queue.py | 44 ------------------- tests/special_e2e/run_fully_async_policy.sh | 2 +- 4 files changed, 26 insertions(+), 61 deletions(-) diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 939f7a45b93..1e6101a6f5d 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -38,16 +38,16 @@ class FullyAsyncRollouter(RayPPOTrainer): """ def __init__( - self, - config, - tokenizer, - role_worker_mapping: dict[Role, WorkerType], - resource_pool_manager: ResourcePoolManager, - ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, - processor=None, - reward_fn=None, - val_reward_fn=None, - device_name=None, + self, + config, + tokenizer, + role_worker_mapping: dict[Role, WorkerType], + resource_pool_manager: ResourcePoolManager, + ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, + processor=None, + reward_fn=None, + val_reward_fn=None, + device_name=None, ): # Store the tokenizer for text processing self.tokenizer = tokenizer @@ -115,6 +115,7 @@ def __init__( # Concurrency control self.paused = False + self.running = True # Initialize async locks directly self.lock = asyncio.Lock() @@ -279,9 +280,9 @@ async def _processor_worker(self): async with self.lock: if await self._should_pause_generation(): - # 等待已提交的任务结束 + print("等待已提交的任务结束") await asyncio.gather(*self.active_tasks, return_exceptions=True) - self.active_tasks = set() + self.active_tasks.clear() self.paused = True while self.paused: @@ -293,6 +294,7 @@ async def _processor_worker(self): # 等待所有活动任务完成 if 
self.active_tasks: await asyncio.gather(*self.active_tasks, return_exceptions=True) + self.active_tasks.clear() break # 检查并发数是否超限 @@ -393,7 +395,6 @@ async def _streaming_generation_main(self): self._init_async_rollout_manager() # 启动流式处理循环 - """流式样本生成主循环 - 优化版本,确保先完成的样本优先进入队列""" print(f"[FullyAsyncRollouter] 启动流式处理模式,最大并发样本数: {self.max_concurrent_samples}") # 初始化异步队列 @@ -439,6 +440,9 @@ async def _streaming_generation_main(self): param_version=self.current_param_version, ) + async with self.lock: + self.running = False + async def fit(self): """ Start the async rollouter - entry point that sets up and runs async tasks @@ -452,7 +456,9 @@ async def fit(self): raise ValueError("param_synchronizer client not set. Call set_parameter_synchronizer() first.") # 设置运行状态 - self.paused = False + async with self.lock: + self.paused = False + self.running = True # 创建主要的异步任务 generation_task = asyncio.create_task(self._streaming_generation_main()) @@ -486,6 +492,9 @@ async def _async_monitor_loop(self): check_interval = 5.0 while True: + async with self.lock: + if not self.running: + break await asyncio.sleep(check_interval) # 定期打印统计信息 current_time = time.time() diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index ce8735bd8cc..784a3318166 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -146,8 +146,8 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]: if sample is None: # 检测到结束信号(None),立即退出 - logger.info( - f"Detected termination signal (None), stopping sample collection. " + print( + f"[FullyAsyncTrainer] Detected termination signal (None), stopping sample collection. 
" f"Collected {len(queue_samples)}/{self.required_samples} samples" ) break diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py index b2761f95749..fc1c133412e 100644 --- a/recipe/fully_async_policy/message_queue.py +++ b/recipe/fully_async_policy/message_queue.py @@ -96,40 +96,6 @@ async def put_sample(self, sample: Any, param_version: int) -> bool: return True - async def get_samples(self, min_batch_count: int = 1) -> tuple[list[Any], int]: - """ - Get batch samples from the queue, wait until enough samples are available - - Args: - min_batch_count: Get samples at once when sample count meets min_batch - - Returns: - List[Any]: List of retrieved samples - """ - async with self._lock: - while len(self.queue) < min_batch_count and self.running: - print(f"[MessageQueue] consumer_condition {len(self.queue)}") - if len(self.queue) > 0 and self.queue[-1] is None: - return [], len(self.queue) - await self._consumer_condition.wait() - - # If queue is closed and doesn't have enough samples, return empty list - if not self.running and len(self.queue) < min_batch_count: - return [], len(self.queue) - - # Get specified number of samples - batch_count = min(min_batch_count, len(self.queue)) - samples = [] - for _ in range(batch_count): - if self.queue: - data = self.queue.popleft() - if data is None: - return [], len(self.queue) - else: - samples.append(data) - - self.total_consumed += len(samples) - return samples, len(self.queue) async def get_sample(self) -> Any | None: """ @@ -140,7 +106,6 @@ async def get_sample(self) -> Any | None: """ async with self._lock: while len(self.queue) == 0 and self.running: - print(f"[MessageQueue] consumer_condition {len(self.queue)}") await self._consumer_condition.wait() # If queue is closed and empty, return None @@ -236,11 +201,6 @@ async def put_sample(self, sample: Any, param_version: int) -> bool: future = self.queue_actor.put_sample.remote(sample, param_version) return await 
asyncio.wrap_future(future.future()) - async def get_samples(self, min_batch_count: int = 1) -> tuple[list[Any], int]: - """Get batch from queue, wait until enough samples are available (async)""" - future = self.queue_actor.get_samples.remote(min_batch_count) - return await asyncio.wrap_future(future.future()) - async def get_sample(self) -> Any | None: """Get single sample from queue, wait until one is available (async)""" future = self.queue_actor.get_sample.remote() @@ -281,10 +241,6 @@ def put_sample_sync(self, sample: Any, param_version: int) -> bool: """Put batch into queue (sync - deprecated, use put_sample instead)""" return ray.get(self.queue_actor.put_sample.remote(sample, param_version)) - def get_samples_sync(self, min_batch_count: int = 1) -> tuple[list[Any], int]: - """Get batch from queue (sync - deprecated, use get_samples instead)""" - return ray.get(self.queue_actor.get_samples.remote(min_batch_count)) - def get_sample_sync(self) -> Any | None: """Get single sample from queue (sync - deprecated, use get_sample instead)""" return ray.get(self.queue_actor.get_sample.remote()) diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh index c48f7b7507c..7674fcd08cd 100644 --- a/tests/special_e2e/run_fully_async_policy.sh +++ b/tests/special_e2e/run_fully_async_policy.sh @@ -44,7 +44,7 @@ loss_agg_mode="token-mean" train_prompt_bsz=0 gen_prompt_bsz=1 n_resp_per_prompt=3 -train_prompt_mini_bsz=32 +train_prompt_mini_bsz=256 total_rollout_steps=5000 From 737a8ce967bddfea10e48445938b1424f08b9e51 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Mon, 18 Aug 2025 15:23:54 +0800 Subject: [PATCH 057/182] system run suceess trigger_parameter_sync_step --- .../config/fully_async_ppo_trainer.yaml | 1 + .../fully_async_rollouter.py | 66 +++++++++++-------- .../fully_async_policy/fully_async_trainer.py | 65 +++++++++++------- recipe/fully_async_policy/message_queue.py | 16 ++--- recipe/fully_async_policy/param_sync.py | 2 
+- tests/special_e2e/run_fully_async_policy.sh | 6 +- 6 files changed, 90 insertions(+), 66 deletions(-) diff --git a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml index 665f7a8be89..a1dbaa7a79b 100644 --- a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml +++ b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml @@ -11,6 +11,7 @@ defaults: async_training: # 新鲜度控制 (Freshness Control) staleness_threshold: 3 # 样本新鲜度阈值 + trigger_parameter_sync_step: 10 # Rollout配置 rollout: diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 1e6101a6f5d..62ce3c24347 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -134,7 +134,8 @@ def __init__( self.required_samples = calculate_one_step_size( self.minimal_bsz, config.actor_rollout_ref.actor.ppo_mini_batch_size ) - self.max_required_samples = self.required_samples * (self.staleness_threshold + 1) + self.max_required_samples = self.required_samples * ( + self.staleness_threshold + 1) * config.async_training.trigger_parameter_sync_step print( f"[FullyAsyncRollouter] required_samples : {self.required_samples} " f"max_required_samples: {self.max_required_samples}" @@ -153,6 +154,11 @@ def __init__( self.max_queue_size = self.max_required_samples * 10 # x 10 avoid deadlock print(f"[FullyAsyncRollouter] {self.max_queue_size}") + # 初始化异步队列 + self.pending_queue = asyncio.Queue(maxsize=100) + self.active_tasks = set() + self.result_queue = asyncio.Queue() + async def set_message_queue_client(self, message_queue_client: MessageQueueClient): """Set message queue client""" async with self.lock: @@ -284,35 +290,37 @@ async def _processor_worker(self): await asyncio.gather(*self.active_tasks, return_exceptions=True) self.active_tasks.clear() self.paused = True - while self.paused: await 
self.condition.wait() # 获取待处理的部分 RolloutSample - if partial_rollout_sample == "DONE": - print("收到结束信号,等待剩余任务完成...") - # 等待所有活动任务完成 - if self.active_tasks: - await asyncio.gather(*self.active_tasks, return_exceptions=True) - self.active_tasks.clear() - break + async with self.lock: + if partial_rollout_sample == "DONE": + print("收到结束信号,等待剩余任务完成...") + # 等待所有活动任务完成 + if self.active_tasks: + await asyncio.gather(*self.active_tasks, return_exceptions=True) + self.active_tasks.clear() + break # 检查并发数是否超限 - while len(self.active_tasks) >= self.max_concurrent_samples: - # 等待至少一个任务完成 - done_tasks, self.active_tasks = await asyncio.wait( - self.active_tasks, return_when=asyncio.FIRST_COMPLETED - ) - # 清理已完成的任务 - for task in done_tasks: - await task + async with self.lock: + while len(self.active_tasks) >= self.max_concurrent_samples: + # 等待至少一个任务完成 + done_tasks, self.active_tasks = await asyncio.wait( + self.active_tasks, return_when=asyncio.FIRST_COMPLETED + ) + # 清理已完成的任务 + for task in done_tasks: + await task # 立即提交单个样本处理 - task = asyncio.create_task( - self._process_single_sample_streaming(partial_rollout_sample), - name=f"process_{partial_rollout_sample.sample_id}", - ) - self.active_tasks.add(task) + async with self.lock: + task = asyncio.create_task( + self._process_single_sample_streaming(partial_rollout_sample), + name=f"process_{partial_rollout_sample.sample_id}", + ) + self.active_tasks.add(task) # 标记队列任务完成 self.pending_queue.task_done() @@ -350,13 +358,12 @@ async def _consumer_worker(self): sample=ray.cloudpickle.dumps(rollout_sample), param_version=rollout_sample.param_version, ) - if success: self.total_generated_samples += 1 else: self.dropped_stale_samples += 1 - print(f"[FullyAsyncRollouter] submit {rollout_sample.sample_id} {'success' if success else 'error'}") + # print(f"[FullyAsyncRollouter] submit {rollout_sample.sample_id} {'success' if success else 'error'}") # 标记结果队列任务完成 self.result_queue.task_done() @@ -397,11 +404,6 @@ async def 
_streaming_generation_main(self): # 启动流式处理循环 print(f"[FullyAsyncRollouter] 启动流式处理模式,最大并发样本数: {self.max_concurrent_samples}") - # 初始化异步队列 - self.pending_queue = asyncio.Queue(maxsize=100) - self.active_tasks = set() - self.result_queue = asyncio.Queue() - # 启动流式处理协程和消费者协程 self.feed_task = asyncio.create_task(self._feed_samples()) self.processor_task = asyncio.create_task(self._processor_worker()) @@ -507,6 +509,8 @@ async def _async_monitor_loop(self): await self.resume() async def _should_pause_generation(self) -> bool: + if self.paused: + return True """Determine whether the build should be paused""" queue_stats = self.message_queue_client.get_statistics_sync() queue_size = queue_stats["queue_size"] @@ -543,6 +547,10 @@ async def pause(self): print("[FullyAsyncRollouter] pause") async with self.lock: self.paused = True + if self.active_tasks: + await asyncio.gather(*self.active_tasks, return_exceptions=True) + self.active_tasks.clear() + print("[FullyAsyncRollouter] All active tasks completed") async def resume(self): """resume rollout diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 784a3318166..e84cecee387 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -15,6 +15,7 @@ import logging import time import warnings +from pprint import pprint from typing import Any import numpy as np @@ -49,16 +50,16 @@ class FullyAsyncTrainer(RayPPOTrainer): """ def __init__( - self, - config, - tokenizer, - role_worker_mapping: dict[Role, WorkerType], - resource_pool_manager: ResourcePoolManager, - ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, - processor=None, - reward_fn=None, - val_reward_fn=None, - device_name=None, + self, + config, + tokenizer, + role_worker_mapping: dict[Role, WorkerType], + resource_pool_manager: ResourcePoolManager, + ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, + processor=None, + reward_fn=None, + 
val_reward_fn=None, + device_name=None, ): # Store the tokenizer for text processing self.tokenizer = tokenizer @@ -107,6 +108,9 @@ def __init__( self.stale_samples_processed = 0 self.current_param_version = 0 + self.local_trigger_step = 1 + self.trigger_parameter_sync_step = config.async_training.trigger_parameter_sync_step + self.required_samples = calculate_one_step_size( self.minimal_bsz, config.actor_rollout_ref.actor.ppo_mini_batch_size ) @@ -302,11 +306,36 @@ def fit(self): # self._collect_metrics(batch, epoch, metrics, timing_raw) + pprint(metrics) + # Trigger parameter synchronization after training step - # self._trigger_parameter_sync_after_step() - print(f"[FullyAsyncTrainer] global_steps: {self.global_steps}") + print(f"[FullyAsyncTrainer] global_steps: {self.global_steps}" + f"[FullyAsyncTrainer] _trigger_parameter_sync_after_step {self.local_trigger_step} {self.trigger_parameter_sync_step}") + self._trigger_parameter_sync_after_step() self.global_steps += 1 + def _trigger_parameter_sync_after_step(self): + """ + Trigger parameter synchronization after training step + This ensures rollouter always uses the latest trained parameters + """ + print("[FullyAsyncTrainer] Trigger parameter synchronization after training step") + if self.local_trigger_step >= self.trigger_parameter_sync_step: + print(f"[FullyAsyncTrainer] Trigger start run") + self.local_trigger_step = 1 + print(f"[FullyAsyncTrainer] {self.current_param_version}") + self.current_param_version = self.current_param_version + 1 + print( + f"[FullyAsyncTrainer] Triggering parameter sync after " + f"training step {self.global_steps}, version: {self.current_param_version}" + ) + ray.get(self.param_synchronizer.sync_weights.remote(self.current_param_version)) + return + else: + print(f"[FullyAsyncTrainer] Trigger {self.local_trigger_step}") + self.local_trigger_step += 1 + return + def get_statistics(self) -> dict: """Get training statistics""" queue_stats = 
self.message_queue_client.get_statistics_sync() if self.message_queue_client else {} @@ -321,18 +350,6 @@ def get_statistics(self) -> dict: "queue_dropped_samples": queue_stats.get("dropped_samples", 0), } - def _trigger_parameter_sync_after_step(self): - """ - Trigger parameter synchronization after training step - This ensures rollouter always uses the latest trained parameters - """ - self.current_param_version = self.current_param_version + 1 - print( - f"[FullyAsyncTrainer] Triggering parameter sync after " - f"training step {self.global_steps}, version: {self.current_param_version}" - ) - ray.get(self.param_synchronizer.sync_weights.remote(self.current_param_version)) - def _compute_sample_freshness_metrics(self, rollout_samples: list[RolloutSample]) -> dict: """ Compute sample freshness metrics diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py index fc1c133412e..2a12cb21c90 100644 --- a/recipe/fully_async_policy/message_queue.py +++ b/recipe/fully_async_policy/message_queue.py @@ -77,7 +77,7 @@ async def put_sample(self, sample: Any, param_version: int) -> bool: staleness = self.current_param_version - param_version if staleness > self.staleness_threshold: self.dropped_samples += 1 - logger.debug(f"Dropped stale sample: staleness={staleness}, threshold={self.staleness_threshold}") + print(f"Dropped stale sample: staleness={staleness}, threshold={self.staleness_threshold}") return False # If queue is full, remove the oldest sample (rarely happens) @@ -92,11 +92,10 @@ async def put_sample(self, sample: Any, param_version: int) -> bool: self._consumer_condition.notify_all() if self.total_produced % 100 == 0: - logger.debug(f"MessageQueue stats: produced={self.total_produced}, queue_size={len(self.queue)}") + print(f"MessageQueue stats: produced={self.total_produced}, queue_size={len(self.queue)}") return True - async def get_sample(self) -> Any | None: """ Get a single sample from the queue, wait until one is 
available @@ -122,7 +121,7 @@ async def update_param_version(self, version: int): async with self._lock: old_version = self.current_param_version self.current_param_version = version - logger.debug(f"Parameter version updated from {old_version} to {version}") + print(f"Parameter version updated from {old_version} to {version}") async def get_queue_size(self) -> int: """Get current queue length""" @@ -206,11 +205,6 @@ async def get_sample(self) -> Any | None: future = self.queue_actor.get_sample.remote() return await asyncio.wrap_future(future.future()) - async def update_param_version(self, version: int): - """Update parameter version (async)""" - future = self.queue_actor.update_param_version.remote(version) - await asyncio.wrap_future(future.future()) - async def get_queue_size(self) -> int: """Get queue size (async)""" future = self.queue_actor.get_queue_size.remote() @@ -248,3 +242,7 @@ def get_sample_sync(self) -> Any | None: def get_statistics_sync(self) -> dict[str, Any]: """Get statistics (sync - deprecated, use get_statistics instead)""" return ray.get(self.queue_actor.get_statistics.remote()) + + def update_param_version_sync(self, version: int): + """Update parameter version (async)""" + return ray.get(self.queue_actor.update_param_version.remote(version)) diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py index 7e40e755a12..53ced11956c 100644 --- a/recipe/fully_async_policy/param_sync.py +++ b/recipe/fully_async_policy/param_sync.py @@ -80,7 +80,7 @@ def sync_weights(self, version): ray.get(self.rollouter.pause.remote()) # Update MQ version - self.mq_client.update_param_version(version) + self.mq_client.update_param_version_sync(version) # sync weights self.actor_wg.sync_rollout_weights() diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh index 7674fcd08cd..1b2df475598 100644 --- a/tests/special_e2e/run_fully_async_policy.sh +++ 
b/tests/special_e2e/run_fully_async_policy.sh @@ -44,7 +44,7 @@ loss_agg_mode="token-mean" train_prompt_bsz=0 gen_prompt_bsz=1 n_resp_per_prompt=3 -train_prompt_mini_bsz=256 +train_prompt_mini_bsz=32 total_rollout_steps=5000 @@ -56,11 +56,11 @@ val_top_p=0.7 # Fully async specific parameters # Allocate 2 GPUs for rollout, remaining for training -n_gpus_rollout=2 +n_gpus_rollout=4 n_gpus_training=$((NUM_GPUS - n_gpus_rollout)) # Async training specific configurations -staleness_threshold=30000 +staleness_threshold=0 exp_name="$(basename "${MODEL_ID,,}")-fully-async-policy-${ACTOR_STRATEGY}-minimal" From defd61f369ad24acdde6b91c9d72b7e6637ce9ff Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Mon, 18 Aug 2025 15:24:09 +0800 Subject: [PATCH 058/182] system run suceess trigger_parameter_sync_step --- recipe/fully_async_policy/fully_async_trainer.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index e84cecee387..3021c536eca 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -309,8 +309,9 @@ def fit(self): pprint(metrics) # Trigger parameter synchronization after training step - print(f"[FullyAsyncTrainer] global_steps: {self.global_steps}" - f"[FullyAsyncTrainer] _trigger_parameter_sync_after_step {self.local_trigger_step} {self.trigger_parameter_sync_step}") + print(f"[FullyAsyncTrainer] global_steps: {self.global_steps} " + f"local_trigger_step: {self.local_trigger_step} " + f"trigger_parameter_sync_step: {self.trigger_parameter_sync_step}") self._trigger_parameter_sync_after_step() self.global_steps += 1 @@ -319,11 +320,8 @@ def _trigger_parameter_sync_after_step(self): Trigger parameter synchronization after training step This ensures rollouter always uses the latest trained parameters """ - print("[FullyAsyncTrainer] Trigger parameter synchronization after training step") 
if self.local_trigger_step >= self.trigger_parameter_sync_step: - print(f"[FullyAsyncTrainer] Trigger start run") self.local_trigger_step = 1 - print(f"[FullyAsyncTrainer] {self.current_param_version}") self.current_param_version = self.current_param_version + 1 print( f"[FullyAsyncTrainer] Triggering parameter sync after " From e86625b459dfa9d62560c90a41b471ca24083c3b Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Mon, 18 Aug 2025 18:04:29 +0800 Subject: [PATCH 059/182] All active tasks completed --- .../config/fully_async_ppo_trainer.yaml | 2 +- .../fully_async_rollouter.py | 61 ++++++++++++------- tests/special_e2e/run_fully_async_policy.sh | 4 +- 3 files changed, 42 insertions(+), 25 deletions(-) diff --git a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml index a1dbaa7a79b..8ccb6d36b71 100644 --- a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml +++ b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml @@ -11,7 +11,7 @@ defaults: async_training: # 新鲜度控制 (Freshness Control) staleness_threshold: 3 # 样本新鲜度阈值 - trigger_parameter_sync_step: 10 + trigger_parameter_sync_step: 1 # >=1 # Rollout配置 rollout: diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 62ce3c24347..e501a6ab142 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -135,11 +135,9 @@ def __init__( self.minimal_bsz, config.actor_rollout_ref.actor.ppo_mini_batch_size ) self.max_required_samples = self.required_samples * ( - self.staleness_threshold + 1) * config.async_training.trigger_parameter_sync_step - print( - f"[FullyAsyncRollouter] required_samples : {self.required_samples} " - f"max_required_samples: {self.max_required_samples}" - ) + self.staleness_threshold + 1) * config.async_training.trigger_parameter_sync_step + print(f"[FullyAsyncRollouter] 
required_samples : {self.required_samples} " + f"max_required_samples: {self.max_required_samples}") # 单次最多扔一次迭代需要的样本 self.max_concurrent_samples = self.required_samples @@ -183,7 +181,8 @@ async def update_param_version(self, version: int): self.current_param_version = version # every time param change, reset staleness_samples self.staleness_samples = 0 - print(f"[FullyAsyncRollouter] Parameter version updated from {old_version} to {version}") + print(f"[FullyAsyncRollouter][Public][update_param_version] " + f"Parameter version updated from {old_version} to {version}") def _validate_config(self): # Validate asynchronous training configuration @@ -263,7 +262,8 @@ async def _feed_samples(self): # 检查是否到达最后一步 if self.global_steps >= self.total_rollout_steps: print( - f"[FullyAsyncRollouter] 达到最大步数,停止添加新样本 " + f"[FullyAsyncRollouter][Feed] " + f"达到最大步数,停止添加新样本 " f"{self.global_steps} >= {self.total_rollout_steps}" ) should_stop = True # 设置停止标志 @@ -275,7 +275,8 @@ async def _feed_samples(self): # 发送结束信号 await self.pending_queue.put("DONE") - print(f"[FullyAsyncRollouter] 样本添加完成,总共添加了 {self.global_steps} 个步骤的样本") + print(f"[FullyAsyncRollouter][Feed] " + f"样本添加完成,总共添加了 {self.global_steps} 个步骤的样本") async def _processor_worker(self): """流式处理工作协程 - 逐个样本立即提交处理,不等待批次""" @@ -286,9 +287,11 @@ async def _processor_worker(self): async with self.lock: if await self._should_pause_generation(): - print("等待已提交的任务结束") - await asyncio.gather(*self.active_tasks, return_exceptions=True) - self.active_tasks.clear() + print(f"[FullyAsyncRollouter][Processor] 等待已提交的任务结束 " + f"{[t.get_name() for t in self.active_tasks]}") + if self.active_tasks: + await asyncio.gather(*self.active_tasks, return_exceptions=True) + self.active_tasks.clear() self.paused = True while self.paused: await self.condition.wait() @@ -296,7 +299,7 @@ async def _processor_worker(self): # 获取待处理的部分 RolloutSample async with self.lock: if partial_rollout_sample == "DONE": - print("收到结束信号,等待剩余任务完成...") + 
print(f"[FullyAsyncRollouter][Processor] 收到结束信号,等待剩余任务完成...") # 等待所有活动任务完成 if self.active_tasks: await asyncio.gather(*self.active_tasks, return_exceptions=True) @@ -316,6 +319,9 @@ async def _processor_worker(self): # 立即提交单个样本处理 async with self.lock: + # pause结束后,获取到锁,还需要判断是否是暂停阶段,否则继续等待 + while self.paused: + await self.condition.wait() task = asyncio.create_task( self._process_single_sample_streaming(partial_rollout_sample), name=f"process_{partial_rollout_sample.sample_id}", @@ -346,7 +352,7 @@ async def _process_single_sample_streaming(self, partial_rollout_sample): if processing_time > self.max_processing_time: self.max_processing_time = processing_time - print(f"[FullyAsyncRollouter] process {partial_rollout_sample.sample_id} cost {processing_time:.2f}s") + print(f"[FullyAsyncRollouter] rollout {partial_rollout_sample.sample_id} cost {processing_time:.2f}s") async def _consumer_worker(self): """消费者协程,负责从结果队列获取处理结果并放入消息队列""" @@ -502,11 +508,14 @@ async def _async_monitor_loop(self): current_time = time.time() if current_time - last_stats_time >= stats_interval: stats = await self.get_statistics() - print(f"[FullyAsyncRollouter] statistics {stats}") + print(f"[FullyAsyncRollouter][MonitorLoop] {stats}") last_stats_time = current_time if not await self._should_pause_generation(): - await self.resume() + async with self.lock: + print(f"[FullyAsyncRollouter][MonitorLoop] trigger resume") + self.paused = False + self.condition.notify_all() async def _should_pause_generation(self) -> bool: if self.paused: @@ -520,20 +529,23 @@ async def _should_pause_generation(self) -> bool: if version_diff > self.staleness_threshold: print( - "[FullyAsyncRollouter] " - f"Should pause due to version_diff > self.staleness_threshold: " + "[FullyAsyncRollouter][ShouldPause] " + f"due to version_diff > self.staleness_threshold: " f"rollout_version={self.current_param_version}, " f"trainer_version={current_trainer_version}, diff={version_diff}" ) return True if queue_size >= 
self.max_queue_size: - print(f"[FullyAsyncRollouter] Should pause due to full queue: size={queue_size}, max={self.max_queue_size}") + print( + "[FullyAsyncRollouter][ShouldPause] " + f" due to full queue: size={queue_size}, max={self.max_queue_size}") return True if self.staleness_samples > self.max_required_samples: print( - f"[FullyAsyncRollouter] Should pause due to " + "[FullyAsyncRollouter][ShouldPause] " + f"due to " f"staleness_samples {self.staleness_samples} > max_required_samples {self.max_required_samples} " ) return True @@ -544,19 +556,24 @@ async def pause(self): """pause rollout TODO integrated Partial Rollout """ - print("[FullyAsyncRollouter] pause") + print("[FullyAsyncRollouter][Public] pause") async with self.lock: self.paused = True if self.active_tasks: + print(f"[FullyAsyncRollouter][Pause] " + f"{[t.get_name() for t in self.active_tasks]}") await asyncio.gather(*self.active_tasks, return_exceptions=True) self.active_tasks.clear() - print("[FullyAsyncRollouter] All active tasks completed") + print("[FullyAsyncRollouter][Pause] All active tasks completed") + + # print("[FullyAsyncRollouter][Public] pause sleep 10") + # await asyncio.sleep(10) async def resume(self): """resume rollout TODO integrated Partial Rollout """ - print("[FullyAsyncRollouter] resume") + print("[FullyAsyncRollouter][Public] resume") async with self.lock: self.paused = False self.condition.notify_all() diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh index 1b2df475598..ebcf07b43f7 100644 --- a/tests/special_e2e/run_fully_async_policy.sh +++ b/tests/special_e2e/run_fully_async_policy.sh @@ -46,7 +46,7 @@ gen_prompt_bsz=1 n_resp_per_prompt=3 train_prompt_mini_bsz=32 -total_rollout_steps=5000 +total_rollout_steps=50000 # Temperature parameters temperature=1.0 @@ -60,7 +60,7 @@ n_gpus_rollout=4 n_gpus_training=$((NUM_GPUS - n_gpus_rollout)) # Async training specific configurations -staleness_threshold=0 
+staleness_threshold=10 exp_name="$(basename "${MODEL_ID,,}")-fully-async-policy-${ACTOR_STRATEGY}-minimal" From ed4d5720885d59df617c363ff26f26c1ddfe595f Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Mon, 18 Aug 2025 18:05:05 +0800 Subject: [PATCH 060/182] pause submit task --- recipe/fully_async_policy/fully_async_rollouter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index e501a6ab142..18dd6a5319d 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -560,11 +560,11 @@ async def pause(self): async with self.lock: self.paused = True if self.active_tasks: - print(f"[FullyAsyncRollouter][Pause] " + print(f"[FullyAsyncRollouter][Public][Pause] " f"{[t.get_name() for t in self.active_tasks]}") await asyncio.gather(*self.active_tasks, return_exceptions=True) self.active_tasks.clear() - print("[FullyAsyncRollouter][Pause] All active tasks completed") + print("[FullyAsyncRollouter][Public][Pause] All active tasks completed") # print("[FullyAsyncRollouter][Public] pause sleep 10") # await asyncio.sleep(10) From bd99e16411e7f86977f061825bf328ad6c0d00d6 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Mon, 18 Aug 2025 19:37:48 +0800 Subject: [PATCH 061/182] steam rollout --- .../config/fully_async_ppo_trainer.yaml | 2 +- .../dapo_7b_math_fsdp2_4_12.sh | 40 +++++++++++++------ .../fully_async_rollouter.py | 17 ++++---- .../fully_async_policy/fully_async_trainer.py | 7 +--- tests/special_e2e/run_fully_async_policy.sh | 15 +++---- 5 files changed, 44 insertions(+), 37 deletions(-) diff --git a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml index 8ccb6d36b71..9d0a8c67383 100644 --- a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml +++ 
b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml @@ -11,7 +11,7 @@ defaults: async_training: # 新鲜度控制 (Freshness Control) staleness_threshold: 3 # 样本新鲜度阈值 - trigger_parameter_sync_step: 1 # >=1 + trigger_parameter_sync_step: 10 # >=1 train 每次训练一个batch, 迭代多少次后触发更新 # Rollout配置 rollout: diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh b/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh index 5c2ac5e6017..86cd25affe2 100644 --- a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh +++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh @@ -21,7 +21,14 @@ CKPTS_DIR=./ckpts/${project_name}/${exp_name} TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet - +rollout_mode="async" +rollout_name="vllm" # sglang or vllm +if [ "$rollout_mode" = "async" ]; then + export VLLM_USE_V1=1 + return_raw_chat="True" +fi + +# Algorithm parameters adv_estimator=grpo use_kl_in_reward=False @@ -32,20 +39,16 @@ kl_loss_coef=0.0 clip_ratio_low=0.2 clip_ratio_high=0.28 +# Response length parameters max_prompt_length=$((1024 * 2)) max_response_length=$((1024 * 8)) enable_overlong_buffer=True overlong_buffer_len=$((1024 * 4)) overlong_penalty_factor=1.0 +# Training parameters loss_agg_mode="token-mean" -train_prompt_bsz=2 -gen_prompt_bsz=4 -n_resp_per_prompt=16 -train_prompt_mini_bsz=32 -train_sync_weight_steps=64 - # Algorithm temperature=1.0 top_p=1.0 @@ -62,14 +65,21 @@ gen_tp=1 sp_size=1 fsdp_size=2 -staleness_threshold=3 - NNODES=${NNODES:-1} NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} +# Fully async specific parameters n_gpus_rollout=4 n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout)) +train_prompt_bsz=0 +gen_prompt_bsz=1 +n_resp_per_prompt=16 +train_prompt_mini_bsz=32 +staleness_threshold=10 +total_rollout_steps=$(((512*16*100))) +trigger_parameter_sync_step=32 + 
/home/hadoop-djst-algoplat/miniconda3/bin/python -m recipe.fully_async_policy.fully_async_main \ data.train_files="${TRAIN_FILE}" \ data.val_files="${TEST_FILE}" \ @@ -79,6 +89,7 @@ n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout)) data.max_response_length=${max_response_length} \ data.train_batch_size=${train_prompt_bsz} \ data.gen_batch_size=${gen_prompt_bsz} \ + data.return_raw_chat=${return_raw_chat} \ actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ algorithm.adv_estimator=${adv_estimator} \ algorithm.use_kl_in_reward=${use_kl_in_reward} \ @@ -125,6 +136,8 @@ n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout)) actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \ actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ + actor_rollout_ref.rollout.name=${rollout_name} \ + actor_rollout_ref.rollout.mode=${rollout_mode} \ reward_model.reward_manager=dapo \ +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ @@ -135,7 +148,7 @@ n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout)) trainer.project_name="${project_name}" \ trainer.experiment_name="${exp_name}" \ trainer.val_before_train=True \ - trainer.test_freq=10 \ + trainer.test_freq=-1 \ trainer.save_freq=-1 \ trainer.default_local_dir="${CKPTS_DIR}" \ trainer.resume_mode=auto \ @@ -143,6 +156,7 @@ n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout)) trainer.n_gpus_per_node="${n_gpus_training}" \ rollout.nnodes="${NNODES}" \ rollout.n_gpus_per_node="${n_gpus_rollout}" \ - rollout.total_rollout_steps=100 \ - rollout.total_epochs=2 \ - async_training.staleness_threshold=${staleness_threshold} + rollout.total_rollout_steps="${total_rollout_steps}" \ + rollout.total_epochs=10 \ + async_training.staleness_threshold="${staleness_threshold}" \ + async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" diff 
--git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 18dd6a5319d..85860d7c5e9 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -157,6 +157,9 @@ def __init__( self.active_tasks = set() self.result_queue = asyncio.Queue() + # 通过 pause 和 resume 控制 monitor_loop 中,是否进行 尝试恢复 操作 + self.monitor_loop_trigger = True + async def set_message_queue_client(self, message_queue_client: MessageQueueClient): """Set message queue client""" async with self.lock: @@ -287,8 +290,7 @@ async def _processor_worker(self): async with self.lock: if await self._should_pause_generation(): - print(f"[FullyAsyncRollouter][Processor] 等待已提交的任务结束 " - f"{[t.get_name() for t in self.active_tasks]}") + print(f"[FullyAsyncRollouter][Processor] 等待已提交的任务结束") if self.active_tasks: await asyncio.gather(*self.active_tasks, return_exceptions=True) self.active_tasks.clear() @@ -511,7 +513,8 @@ async def _async_monitor_loop(self): print(f"[FullyAsyncRollouter][MonitorLoop] {stats}") last_stats_time = current_time - if not await self._should_pause_generation(): + # pause 和 resume 直接,不进行恢复操作 + if self.monitor_loop_trigger and not await self._should_pause_generation(): async with self.lock: print(f"[FullyAsyncRollouter][MonitorLoop] trigger resume") self.paused = False @@ -560,14 +563,11 @@ async def pause(self): async with self.lock: self.paused = True if self.active_tasks: - print(f"[FullyAsyncRollouter][Public][Pause] " - f"{[t.get_name() for t in self.active_tasks]}") + print(f"[FullyAsyncRollouter][Public][Pause]") await asyncio.gather(*self.active_tasks, return_exceptions=True) self.active_tasks.clear() print("[FullyAsyncRollouter][Public][Pause] All active tasks completed") - - # print("[FullyAsyncRollouter][Public] pause sleep 10") - # await asyncio.sleep(10) + self.monitor_loop_trigger = False async def resume(self): """resume rollout @@ -577,6 +577,7 @@ async def 
resume(self): async with self.lock: self.paused = False self.condition.notify_all() + self.monitor_loop_trigger = True async def get_statistics(self) -> dict: queue_stats = self.message_queue_client.get_statistics_sync() diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 3021c536eca..cc0c2378021 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -158,7 +158,7 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]: queue_samples.append(sample) - if len(queue_samples) % 10 == 0 or len(queue_samples) >= self.required_samples: + if len(queue_samples) % 10 == 0: print(f"[FullyAsyncTrainer] Collected {len(queue_samples)}/{self.required_samples} samples") consumer_end = time.time() @@ -323,14 +323,9 @@ def _trigger_parameter_sync_after_step(self): if self.local_trigger_step >= self.trigger_parameter_sync_step: self.local_trigger_step = 1 self.current_param_version = self.current_param_version + 1 - print( - f"[FullyAsyncTrainer] Triggering parameter sync after " - f"training step {self.global_steps}, version: {self.current_param_version}" - ) ray.get(self.param_synchronizer.sync_weights.remote(self.current_param_version)) return else: - print(f"[FullyAsyncTrainer] Trigger {self.local_trigger_step}") self.local_trigger_step += 1 return diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh index ebcf07b43f7..8e0b82ddefc 100644 --- a/tests/special_e2e/run_fully_async_policy.sh +++ b/tests/special_e2e/run_fully_async_policy.sh @@ -41,12 +41,6 @@ overlong_penalty_factor=1.0 # Training parameters loss_agg_mode="token-mean" -train_prompt_bsz=0 -gen_prompt_bsz=1 -n_resp_per_prompt=3 -train_prompt_mini_bsz=32 - -total_rollout_steps=50000 # Temperature parameters temperature=1.0 @@ -55,11 +49,14 @@ top_k=-1 val_top_p=0.7 # Fully async specific parameters -# Allocate 2 GPUs for 
rollout, remaining for training -n_gpus_rollout=4 +n_gpus_rollout=6 n_gpus_training=$((NUM_GPUS - n_gpus_rollout)) -# Async training specific configurations +train_prompt_bsz=0 +gen_prompt_bsz=1 +n_resp_per_prompt=3 +train_prompt_mini_bsz=32 +total_rollout_steps=50000 staleness_threshold=10 exp_name="$(basename "${MODEL_ID,,}")-fully-async-policy-${ACTOR_STRATEGY}-minimal" From c59055ca7e4ed7eabe46604669312533b25e5260 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Mon, 18 Aug 2025 19:37:57 +0800 Subject: [PATCH 062/182] steam rollout --- .../fully_async_rollouter.py | 44 ++++++++++--------- .../fully_async_policy/fully_async_trainer.py | 28 ++++++------ 2 files changed, 39 insertions(+), 33 deletions(-) diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 85860d7c5e9..ac1253454d7 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -38,16 +38,16 @@ class FullyAsyncRollouter(RayPPOTrainer): """ def __init__( - self, - config, - tokenizer, - role_worker_mapping: dict[Role, WorkerType], - resource_pool_manager: ResourcePoolManager, - ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, - processor=None, - reward_fn=None, - val_reward_fn=None, - device_name=None, + self, + config, + tokenizer, + role_worker_mapping: dict[Role, WorkerType], + resource_pool_manager: ResourcePoolManager, + ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, + processor=None, + reward_fn=None, + val_reward_fn=None, + device_name=None, ): # Store the tokenizer for text processing self.tokenizer = tokenizer @@ -134,10 +134,13 @@ def __init__( self.required_samples = calculate_one_step_size( self.minimal_bsz, config.actor_rollout_ref.actor.ppo_mini_batch_size ) - self.max_required_samples = self.required_samples * ( - self.staleness_threshold + 1) * config.async_training.trigger_parameter_sync_step - print(f"[FullyAsyncRollouter] required_samples : 
{self.required_samples} " - f"max_required_samples: {self.max_required_samples}") + self.max_required_samples = ( + self.required_samples * (self.staleness_threshold + 1) * config.async_training.trigger_parameter_sync_step + ) + print( + f"[FullyAsyncRollouter] required_samples : {self.required_samples} " + f"max_required_samples: {self.max_required_samples}" + ) # 单次最多扔一次迭代需要的样本 self.max_concurrent_samples = self.required_samples @@ -184,8 +187,10 @@ async def update_param_version(self, version: int): self.current_param_version = version # every time param change, reset staleness_samples self.staleness_samples = 0 - print(f"[FullyAsyncRollouter][Public][update_param_version] " - f"Parameter version updated from {old_version} to {version}") + print( + f"[FullyAsyncRollouter][Public][update_param_version] " + f"Parameter version updated from {old_version} to {version}" + ) def _validate_config(self): # Validate asynchronous training configuration @@ -278,8 +283,7 @@ async def _feed_samples(self): # 发送结束信号 await self.pending_queue.put("DONE") - print(f"[FullyAsyncRollouter][Feed] " - f"样本添加完成,总共添加了 {self.global_steps} 个步骤的样本") + print(f"[FullyAsyncRollouter][Feed] 样本添加完成,总共添加了 {self.global_steps} 个步骤的样本") async def _processor_worker(self): """流式处理工作协程 - 逐个样本立即提交处理,不等待批次""" @@ -541,8 +545,8 @@ async def _should_pause_generation(self) -> bool: if queue_size >= self.max_queue_size: print( - "[FullyAsyncRollouter][ShouldPause] " - f" due to full queue: size={queue_size}, max={self.max_queue_size}") + f"[FullyAsyncRollouter][ShouldPause] due to full queue: size={queue_size}, max={self.max_queue_size}" + ) return True if self.staleness_samples > self.max_required_samples: diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index cc0c2378021..402ccf6c926 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -50,16 +50,16 @@ class 
FullyAsyncTrainer(RayPPOTrainer): """ def __init__( - self, - config, - tokenizer, - role_worker_mapping: dict[Role, WorkerType], - resource_pool_manager: ResourcePoolManager, - ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, - processor=None, - reward_fn=None, - val_reward_fn=None, - device_name=None, + self, + config, + tokenizer, + role_worker_mapping: dict[Role, WorkerType], + resource_pool_manager: ResourcePoolManager, + ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, + processor=None, + reward_fn=None, + val_reward_fn=None, + device_name=None, ): # Store the tokenizer for text processing self.tokenizer = tokenizer @@ -309,9 +309,11 @@ def fit(self): pprint(metrics) # Trigger parameter synchronization after training step - print(f"[FullyAsyncTrainer] global_steps: {self.global_steps} " - f"local_trigger_step: {self.local_trigger_step} " - f"trigger_parameter_sync_step: {self.trigger_parameter_sync_step}") + print( + f"[FullyAsyncTrainer] global_steps: {self.global_steps} " + f"local_trigger_step: {self.local_trigger_step} " + f"trigger_parameter_sync_step: {self.trigger_parameter_sync_step}" + ) self._trigger_parameter_sync_after_step() self.global_steps += 1 From 5f1302e26bb80fab0ac6bd2b76b0c06b7115fbfb Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Mon, 18 Aug 2025 20:36:30 +0800 Subject: [PATCH 063/182] fully log --- recipe/fully_async_policy/detach_utils.py | 16 ++- .../fully_async_rollouter.py | 7 + .../fully_async_policy/fully_async_trainer.py | 134 ++++-------------- recipe/fully_async_policy/message_queue.py | 2 +- 4 files changed, 44 insertions(+), 115 deletions(-) diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py index a76f42d7362..426a51ae35e 100644 --- a/recipe/fully_async_policy/detach_utils.py +++ b/recipe/fully_async_policy/detach_utils.py @@ -163,18 +163,22 @@ def assemble_batch_from_rollout_samples( # 收集统计信息和元数据(直接从 RolloutSample 中获取) param_versions = [rs.param_version for rs in 
rollout_samples] - sample_timestamps = [rs.generation_timestamp for rs in rollout_samples] + + processing_time_stats = { + "avg_processing_time": np.mean(processing_times), + "max_processing_time": np.max(processing_times), + "min_processing_time": np.min(processing_times), + "tp50_processing_time": np.percentile(processing_times, 50), # 中位数 + "tp99_processing_time": np.percentile(processing_times, 99), # 99百分位 + "tp95_processing_time": np.percentile(processing_times, 95), # 95百分位也很有用 + } # 创建 meta_info final_batch.meta_info.update( { "rollout_param_versions": param_versions, - "sample_timestamps": sample_timestamps, - "avg_processing_time": np.mean(processing_times) if processing_times else 0, - "max_processing_time": np.max(processing_times) if processing_times else 0, "param_version_diversity": len(set(param_versions)) if param_versions else 0, - "avg_sample_age": np.mean([time.time() - ts for ts in sample_timestamps]) if sample_timestamps else 0, - "assembly_time": time.time() - start_time, + **processing_time_stats, } ) diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index ac1253454d7..cad1542afc4 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -241,6 +241,8 @@ async def _feed_samples(self): sample_count = 0 should_stop = False + progress_bar = tqdm(total=self.total_rollout_steps, initial=self.global_steps, desc="Training Progress") + for epoch, batch_dict in continuous_iterator: if should_stop: # 检查停止标志 break @@ -277,11 +279,13 @@ async def _feed_samples(self): should_stop = True # 设置停止标志 break + progress_bar.update(1) self.global_steps += 1 sample_count += 1 # 发送结束信号 + progress_bar.close() await self.pending_queue.put("DONE") print(f"[FullyAsyncRollouter][Feed] 样本添加完成,总共添加了 {self.global_steps} 个步骤的样本") @@ -597,6 +601,9 @@ async def get_statistics(self) -> dict: "pending_queue_size": self.pending_queue.qsize(), 
"active_tasks_size": len(self.active_tasks), "result_queue_size": self.result_queue.qsize(), + "max_required_samples": self.max_required_samples, + "required_samples": self.required_samples, + "staleness_threshold": self.staleness_threshold, } return stats diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 402ccf6c926..ed18e209c96 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -146,7 +146,7 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]: while len(queue_samples) < self.required_samples: # 获取单个样本,会一直等待直到有样本或收到None - sample = self.message_queue_client.get_sample_sync() + sample, queue_len = self.message_queue_client.get_sample_sync() if sample is None: # 检测到结束信号(None),立即退出 @@ -159,7 +159,9 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]: queue_samples.append(sample) if len(queue_samples) % 10 == 0: - print(f"[FullyAsyncTrainer] Collected {len(queue_samples)}/{self.required_samples} samples") + print( + f"[FullyAsyncTrainer] Collected {len(queue_samples)}/{self.required_samples} samples. 
mq_len: {queue_len}" + ) consumer_end = time.time() @@ -256,58 +258,36 @@ def fit(self): if batch is None: break - # 更新统计信息 - self.processed_samples += len(batch) if isinstance(batch, list) else 1 - # 从meta_info中获取参数版本信息 if hasattr(batch, "meta_info") and batch.meta_info: - # meta_info={'metrics': [{'generate_sequences': 1.8240885734558105, 'tool_calls': 0.0}, - # {'generate_sequences': 2.5197629928588867, 'tool_calls': 0.0}, - # {'generate_sequences': 3.5084900856018066, 'tool_calls': 0.0}, - # {'generate_sequences': 2.4329097270965576, 'tool_calls': 0.0}, - # {'generate_sequences': 3.0567924976348877, 'tool_calls': 0.0}, - # {'generate_sequences': 4.271160840988159, 'tool_calls': 0.0}], - # 'global_steps': 22, - # 'global_token_num': [588, 517, 422, 406, 355, 288], - # 'rollout_param_versions': [0, 0, 0, 0, 0, 0], - # 'sample_timestamps': [1755278023.7771623, 1755278024.101492, 1755278024.3597627, - # 1755278024.4885263, 1755278025.1039019, 1755278025.555585], - # 'avg_processing_time': 2.935534119606018, - # 'max_processing_time': 4.271160840988159, - # 'param_version_diversity': 1, - # 'avg_sample_age': 1.0503787994384766, - # 'assembly_time': 0.05373978614807129}) - rollout_param_versions = batch.meta_info.get("rollout_param_versions", []) - if rollout_param_versions: - # 统计陈旧样本 - stale_count = sum(1 for v in rollout_param_versions if self.current_param_version - v > 1) - self.stale_samples_processed += stale_count - - # 添加新鲜度指标到metrics - if rollout_param_versions: - param_version_diversity = batch.meta_info.get("param_version_diversity", 0) - avg_sample_age = batch.meta_info.get("avg_sample_age", 0) - - metrics.update( - { - "freshness/param_version_diversity": param_version_diversity, - "freshness/avg_sample_age": avg_sample_age, - "freshness/stale_samples_ratio": stale_count / len(rollout_param_versions) - if rollout_param_versions - else 0, - "statistics/processed_samples": self.processed_samples, - "statistics/stale_samples_processed": 
self.stale_samples_processed, - "statistics/current_param_version": self.current_param_version, - } - ) + # 统计陈旧样本 + rollout_param_versions = batch.meta_info["rollout_param_versions"] + stale_count = sum(1 for v in rollout_param_versions if self.current_param_version - v > 1) + self.stale_samples_processed += stale_count + metrics.update( + { + "fully_async/stale_samples_ratio": stale_count / len(rollout_param_versions), + "fully_async/stale_samples_processed": self.stale_samples_processed, + "fully_async/current_param_version": self.current_param_version, + } + ) + for metric in [ + "avg_processing_time", + "max_processing_time", + "min_processing_time", + "tp50_processing_time", + "tp99_processing_time", + "tp95_processing_time", + "param_version_diversity", + ]: + metrics[f"fully_async/{metric}"] = batch.meta_info.get(metric, 0) + batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw) self._log_rollout(batch, reward_extra_infos_dict, timing_raw) self._check_save_checkpoint(False, timing_raw) # self._collect_metrics(batch, epoch, metrics, timing_raw) - pprint(metrics) - # Trigger parameter synchronization after training step print( f"[FullyAsyncTrainer] global_steps: {self.global_steps} " @@ -330,65 +310,3 @@ def _trigger_parameter_sync_after_step(self): else: self.local_trigger_step += 1 return - - def get_statistics(self) -> dict: - """Get training statistics""" - queue_stats = self.message_queue_client.get_statistics_sync() if self.message_queue_client else {} - return { - "global_steps": self.global_steps, - "processed_samples": self.processed_samples, - "stale_samples_processed": self.stale_samples_processed, - "current_param_version": self.current_param_version, - "queue_size": queue_stats.get("queue_size", 0), - "queue_total_produced": queue_stats.get("total_produced", 0), - "queue_total_consumed": queue_stats.get("total_consumed", 0), - "queue_dropped_samples": queue_stats.get("dropped_samples", 0), - } - - def 
_compute_sample_freshness_metrics(self, rollout_samples: list[RolloutSample]) -> dict: - """ - Compute sample freshness metrics - - Args: - rollout_samples: List of RolloutSample objects - - Returns: - dict: Dictionary of freshness metrics - """ - if not rollout_samples: - return {} - - try: - # Extract parameter versions and timestamps directly from RolloutSample - sample_ages = [] - sample_latencies = [] - current_time = time.time() - - for sample in rollout_samples: - # Get information directly from RolloutSample - rollout_version = sample.param_version - generation_time = sample.generation_timestamp - - age = max(0, self.current_param_version - rollout_version) - latency = max(0, current_time - generation_time) - - sample_ages.append(age) - sample_latencies.append(latency) - - if not sample_ages: - return {} - - return { - "freshness/avg_sample_age": np.mean(sample_ages), - "freshness/max_sample_age": max(sample_ages), - "freshness/min_sample_age": min(sample_ages), - "freshness/avg_sample_latency": np.mean(sample_latencies), - "freshness/max_sample_latency": max(sample_latencies), - "freshness/min_sample_latency": min(sample_latencies), - "freshness/stale_samples_ratio": sum(1 for age in sample_ages if age > 1) / len(sample_ages), - "freshness/sample_count": len(sample_ages), - } - - except Exception as e: - logger.error(f"Error computing freshness metrics: {e}") - return {"freshness/error": str(e)} diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py index 2a12cb21c90..012445d45ed 100644 --- a/recipe/fully_async_policy/message_queue.py +++ b/recipe/fully_async_policy/message_queue.py @@ -114,7 +114,7 @@ async def get_sample(self) -> Any | None: # Get one sample data = self.queue.popleft() self.total_consumed += 1 - return data + return data, len(self.queue) async def update_param_version(self, version: int): """Update current parameter version""" From 42789e85be3ab32dbbc52d4bbeb9cb3353e7f6bb Mon Sep 17 00:00:00 
2001 From: ArronHZG Date: Mon, 18 Aug 2025 20:51:23 +0800 Subject: [PATCH 064/182] fully async log --- .../fully_async_rollouter.py | 11 +- .../fully_async_policy/fully_async_trainer.py | 5 +- .../unittest/ray_async_resource_config.py | 4 +- .../unittest/test_asyncio_message_queue.py | 407 ------------------ .../unittest/test_batch_utils.py | 278 +----------- 5 files changed, 12 insertions(+), 693 deletions(-) delete mode 100644 recipe/fully_async_policy/unittest/test_asyncio_message_queue.py diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index cad1542afc4..f349b5e06ed 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -17,6 +17,7 @@ import ray from omegaconf import OmegaConf +from tqdm import tqdm from recipe.fully_async_policy.detach_utils import ( RolloutSample, @@ -298,7 +299,7 @@ async def _processor_worker(self): async with self.lock: if await self._should_pause_generation(): - print(f"[FullyAsyncRollouter][Processor] 等待已提交的任务结束") + print("[FullyAsyncRollouter][Processor] 等待已提交的任务结束") if self.active_tasks: await asyncio.gather(*self.active_tasks, return_exceptions=True) self.active_tasks.clear() @@ -309,7 +310,7 @@ async def _processor_worker(self): # 获取待处理的部分 RolloutSample async with self.lock: if partial_rollout_sample == "DONE": - print(f"[FullyAsyncRollouter][Processor] 收到结束信号,等待剩余任务完成...") + print("[FullyAsyncRollouter][Processor] 收到结束信号,等待剩余任务完成...") # 等待所有活动任务完成 if self.active_tasks: await asyncio.gather(*self.active_tasks, return_exceptions=True) @@ -524,7 +525,7 @@ async def _async_monitor_loop(self): # pause 和 resume 直接,不进行恢复操作 if self.monitor_loop_trigger and not await self._should_pause_generation(): async with self.lock: - print(f"[FullyAsyncRollouter][MonitorLoop] trigger resume") + print("[FullyAsyncRollouter][MonitorLoop] trigger resume") self.paused = False self.condition.notify_all() @@ -571,7 +572,7 
@@ async def pause(self): async with self.lock: self.paused = True if self.active_tasks: - print(f"[FullyAsyncRollouter][Public][Pause]") + print("[FullyAsyncRollouter][Public][Pause]") await asyncio.gather(*self.active_tasks, return_exceptions=True) self.active_tasks.clear() print("[FullyAsyncRollouter][Public][Pause] All active tasks completed") @@ -595,7 +596,7 @@ async def get_statistics(self) -> dict: "total_generated_samples": self.total_generated_samples, "staleness_samples": self.staleness_samples, "dropped_stale_samples": self.dropped_stale_samples, - "queue_max_size": self.max_queue_size, + "max_queue_size": self.max_queue_size, "queue_size": queue_stats["queue_size"], "max_concurrent_samples": self.max_concurrent_samples, "pending_queue_size": self.pending_queue.qsize(), diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index ed18e209c96..b82b1c4d5d2 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -18,12 +18,10 @@ from pprint import pprint from typing import Any -import numpy as np import ray from omegaconf import OmegaConf from recipe.fully_async_policy.detach_utils import ( - RolloutSample, assemble_batch_from_rollout_samples, calculate_one_step_size, ) @@ -160,7 +158,8 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]: if len(queue_samples) % 10 == 0: print( - f"[FullyAsyncTrainer] Collected {len(queue_samples)}/{self.required_samples} samples. mq_len: {queue_len}" + f"[FullyAsyncTrainer] Collected {len(queue_samples)}/{self.required_samples} samples. 
" + f"mq_len: {queue_len}" ) consumer_end = time.time() diff --git a/recipe/fully_async_policy/unittest/ray_async_resource_config.py b/recipe/fully_async_policy/unittest/ray_async_resource_config.py index 40e85c9f1bd..930f8c5169f 100644 --- a/recipe/fully_async_policy/unittest/ray_async_resource_config.py +++ b/recipe/fully_async_policy/unittest/ray_async_resource_config.py @@ -349,10 +349,10 @@ async def main(): # 压力测试 await run_resource_stress_test() - print("\n✅ 所有测试完成!") + print("\n所有测试完成!") except Exception as e: - print(f"❌ 测试执行失败: {e}") + print(f"测试执行失败: {e}") import traceback traceback.print_exc() diff --git a/recipe/fully_async_policy/unittest/test_asyncio_message_queue.py b/recipe/fully_async_policy/unittest/test_asyncio_message_queue.py deleted file mode 100644 index 33e0d9db04d..00000000000 --- a/recipe/fully_async_policy/unittest/test_asyncio_message_queue.py +++ /dev/null @@ -1,407 +0,0 @@ -# Copyright 2025 Meituan Ltd. and/or its affiliates -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# 测试使用 asyncio 的 MessageQueue -# 对比 @ray.remote(num_cpus, max_concurrency) 参数的实际效果 - -import asyncio -import random - -# 导入修改后的 MessageQueue -import time -from dataclasses import dataclass - -import ray -from omegaconf import DictConfig - -from recipe.fully_async_policy.message_queue import MessageQueue, MessageQueueClient, QueueSample - - -@dataclass -class TestConfig: - """测试配置""" - - async_training: dict - - -def create_test_config() -> DictConfig: - """创建测试配置""" - from omegaconf import OmegaConf - - config_dict = {"async_training": {"staleness_threshold": 3}} - return OmegaConf.create(config_dict) - - -class AsyncMessageQueueTester: - """异步消息队列测试器""" - - def __init__(self): - self.config = create_test_config() - - async def test_basic_async_operations(self): - """测试基本异步操作""" - print("\n🧪 测试基本异步操作") - print("=" * 50) - - # 创建MessageQueue Actor - queue_actor = MessageQueue.remote(self.config, max_queue_size=100) - client = MessageQueueClient(queue_actor) - - # 测试异步放入样本 - test_samples = [ - QueueSample( - data={"task_id": f"task_{i}", "content": f"测试数据_{i}"}, - rollout_metadata={"timestamp": time.time(), "version": 1}, - ) - for i in range(10) - ] - - # 异步并发放入样本 - put_tasks = [] - for i, sample in enumerate(test_samples): - task = asyncio.create_task(client.put_sample(sample, param_version=1), name=f"put_task_{i}") - put_tasks.append(task) - - # 等待所有放入任务完成 - put_results = await asyncio.gather(*put_tasks) - successful_puts = sum(put_results) - - print(f"✅ 成功放入 {successful_puts}/{len(test_samples)} 个样本") - - # 异步获取统计信息 - stats = await client.get_statistics() - print(f"📊 队列统计: {stats}") - - # 异步获取样本 - samples_batch, queue_size = await client.get_samples(min_batch_count=5) - print(f"📦 获取了 {len(samples_batch)} 个样本,剩余队列大小: {queue_size}") - - # 清理 - await client.shutdown() - - return successful_puts - - async def test_concurrent_producers_consumers(self): - """测试并发生产者和消费者""" - print("\n🏭 测试并发生产者和消费者") - print("=" * 50) - - # 创建 MessageQueue Actor - queue_actor = 
MessageQueue.remote(self.config, max_queue_size=200) - client = MessageQueueClient(queue_actor) - - # 生产者协程 - async def producer(producer_id: int, sample_count: int): - """生产者协程""" - produced = 0 - for i in range(sample_count): - sample = QueueSample( - data={ - "producer_id": producer_id, - "task_id": f"producer_{producer_id}_task_{i}", - "content": f"来自生产者{producer_id}的数据{i}", - }, - rollout_metadata={"producer_timestamp": time.time(), "producer_id": producer_id}, - ) - - success = await client.put_sample(sample, param_version=1) - if success: - produced += 1 - - # 模拟生产间隔 - await asyncio.sleep(random.uniform(0.01, 0.1)) - - print(f"🏭 生产者{producer_id} 完成,成功生产 {produced} 个样本") - return produced - - # 消费者协程 - async def consumer(consumer_id: int, target_count: int): - """消费者协程""" - consumed = 0 - start_time = time.time() - - while consumed < target_count: - try: - # 尝试获取样本,设置超时 - sample = await asyncio.wait_for(client.get_sample(), timeout=2.0) - - if sample is not None: - consumed += 1 - - if consumed % 10 == 0: - print(f"🍽️ 消费者{consumer_id} 已消费 {consumed} 个样本") - else: - print(f"⚠️ 消费者{consumer_id} 收到空样本,队列可能已关闭") - break - - except asyncio.TimeoutError: - print(f"⏰ 消费者{consumer_id} 超时,检查队列状态...") - stats = await client.get_statistics() - if stats["queue_size"] == 0: - print(f"📭 队列为空,消费者{consumer_id} 等待...") - await asyncio.sleep(0.5) - continue - - # 模拟处理时间 - await asyncio.sleep(random.uniform(0.02, 0.05)) - - elapsed = time.time() - start_time - print(f"🍽️ 消费者{consumer_id} 完成,消费了 {consumed} 个样本,耗时 {elapsed:.2f}s") - return consumed - - # 启动并发生产者和消费者 - num_producers = 3 - num_consumers = 2 - samples_per_producer = 20 - - # 创建生产者任务 - producer_tasks = [ - asyncio.create_task(producer(i, samples_per_producer), name=f"producer_{i}") for i in range(num_producers) - ] - - # 创建消费者任务 - total_expected_samples = num_producers * samples_per_producer - samples_per_consumer = total_expected_samples // num_consumers - - consumer_tasks = [ - asyncio.create_task( - consumer(i, 
samples_per_consumer + (5 if i == 0 else 0)), # 第一个消费者多处理一些 - name=f"consumer_{i}", - ) - for i in range(num_consumers) - ] - - # 等待所有任务完成 - start_time = time.time() - - producer_results = await asyncio.gather(*producer_tasks, return_exceptions=True) - consumer_results = await asyncio.gather(*consumer_tasks, return_exceptions=True) - - end_time = time.time() - - # 统计结果 - total_produced = sum(r for r in producer_results if isinstance(r, int)) - total_consumed = sum(r for r in consumer_results if isinstance(r, int)) - - print("\n📈 并发测试结果:") - print(f" 总生产样本: {total_produced}") - print(f" 总消费样本: {total_consumed}") - print(f" 总耗时: {end_time - start_time:.2f}s") - print(f" 生产效率: {total_produced / (end_time - start_time):.2f} samples/s") - print(f" 消费效率: {total_consumed / (end_time - start_time):.2f} samples/s") - - # 最终统计 - final_stats = await client.get_statistics() - print(f"📊 最终队列统计: {final_stats}") - - # 清理 - await client.shutdown() - - return total_produced, total_consumed - - async def compare_resource_configurations(self): - """对比不同资源配置的效果""" - print("\n⚡ 对比不同资源配置的效果") - print("=" * 50) - - # 测试配置列表 - configs = [ - {"name": "默认配置", "num_cpus": None, "max_concurrency": None, "decorator": ray.remote}, - { - "name": "高CPU低并发", - "num_cpus": 4, - "max_concurrency": 5, - "decorator": lambda: ray.remote(num_cpus=4, max_concurrency=5), - }, - { - "name": "低CPU高并发", - "num_cpus": 1, - "max_concurrency": 20, - "decorator": lambda: ray.remote(num_cpus=1, max_concurrency=20), - }, - { - "name": "平衡配置", - "num_cpus": 2, - "max_concurrency": 10, - "decorator": lambda: ray.remote(num_cpus=2, max_concurrency=10), - }, - ] - - results = {} - - for config in configs: - print(f"\n🧪 测试配置: {config['name']}") - print(f" num_cpus: {config['num_cpus']}") - print(f" max_concurrency: {config['max_concurrency']}") - - # 动态创建MessageQueue类 - if config["num_cpus"] is None: - QueueClass = MessageQueue - else: - QueueClass = config["decorator"]()(MessageQueue) - - # 创建queue实例 - queue_actor = 
QueueClass.remote(self.config, max_queue_size=100) - client = MessageQueueClient(queue_actor) - - # 执行性能测试 - start_time = time.time() - - # 并发放入大量样本 - sample_count = 50 - put_tasks = [] - - for i in range(sample_count): - sample = QueueSample( - data={ - "task_id": f"perf_test_{i}", - "config": config["name"], - "data_size": random.randint(100, 1000), - }, - rollout_metadata={"config_test": True}, - ) - - task = asyncio.create_task(client.put_sample(sample, param_version=1)) - put_tasks.append(task) - - # 模拟流式到达 - if i % 10 == 0: - await asyncio.sleep(0.01) - - # 等待所有put完成 - put_results = await asyncio.gather(*put_tasks) - put_time = time.time() - start_time - - # 获取所有样本 - get_start_time = time.time() - all_samples = [] - - while True: - samples_batch, queue_size = await client.get_samples(min_batch_count=1) - if not samples_batch: - break - all_samples.extend(samples_batch) - - if queue_size == 0: - break - - get_time = time.time() - get_start_time - total_time = time.time() - start_time - - successful_puts = sum(put_results) - - # 记录结果 - results[config["name"]] = { - "successful_puts": successful_puts, - "retrieved_samples": len(all_samples), - "put_time": put_time, - "get_time": get_time, - "total_time": total_time, - "put_throughput": successful_puts / put_time if put_time > 0 else 0, - "get_throughput": len(all_samples) / get_time if get_time > 0 else 0, - "total_throughput": (successful_puts + len(all_samples)) / total_time if total_time > 0 else 0, - } - - print(f" ✅ 放入: {successful_puts}/{sample_count}") - print(f" 📦 获取: {len(all_samples)}") - print(f" ⏱️ 放入耗时: {put_time:.3f}s") - print(f" ⏱️ 获取耗时: {get_time:.3f}s") - print(f" 🚀 放入吞吐量: {successful_puts / put_time:.2f} ops/s") - - # 清理 - await client.shutdown() - - # 间隔 - await asyncio.sleep(1) - - # 生成对比报告 - print("\n📊 资源配置对比报告") - print("=" * 80) - print(f"{'配置名称':<15} {'放入吞吐量':<12} {'获取吞吐量':<12} {'总吞吐量':<12} {'总耗时':<10}") - print("-" * 80) - - best_config = "" - best_throughput = 0 - - for config_name, 
result in results.items(): - put_throughput = result["put_throughput"] - get_throughput = result["get_throughput"] - total_throughput = result["total_throughput"] - total_time = result["total_time"] - - print( - f"{config_name:<15} {put_throughput:<12.2f} {get_throughput:<12.2f} " - f"{total_throughput:<12.2f} {total_time:<10.3f}s" - ) - - if total_throughput > best_throughput: - best_throughput = total_throughput - best_config = config_name - - print(f"\n🏆 最佳配置: {best_config} (总吞吐量: {best_throughput:.2f} ops/s)") - - return results - - -async def main(): - """主函数""" - # 初始化Ray - if not ray.is_initialized(): - ray.init( - num_cpus=8, - object_store_memory=1000000000, # 1GB - ignore_reinit_error=True, - ) - - print("🎯 异步MessageQueue测试") - print(f"Ray集群资源: {ray.cluster_resources()}") - - tester = AsyncMessageQueueTester() - - try: - # 基本异步操作测试 - await tester.test_basic_async_operations() - - # 并发生产者消费者测试 - await tester.test_concurrent_producers_consumers() - - # 资源配置对比测试 - await tester.compare_resource_configurations() - - print("\n✅ 所有测试完成!") - - # 总结 - print("\n📋 总结:") - print("1. 使用 asyncio 后的优势:") - print(" - 真正的异步等待,不阻塞事件循环") - print(" - 更好的并发性能") - print(" - 与Ray的异步接口完美集成") - - print("\n2. 
资源配置建议:") - print(" - num_cpus: 控制CPU资源分配,影响计算密集型任务") - print(" - max_concurrency: 控制并发数,影响I/O密集型任务") - print(" - 对于MessageQueue: 推荐 num_cpus=2, max_concurrency=20") - - except Exception as e: - print(f"❌ 测试失败: {e}") - import traceback - - traceback.print_exc() - - finally: - ray.shutdown() - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/recipe/fully_async_policy/unittest/test_batch_utils.py b/recipe/fully_async_policy/unittest/test_batch_utils.py index ddde3a4ad92..b9351c46c28 100644 --- a/recipe/fully_async_policy/unittest/test_batch_utils.py +++ b/recipe/fully_async_policy/unittest/test_batch_utils.py @@ -98,282 +98,8 @@ def create_mock_rollout_sample(self, sample_id: str, param_version: int = 1) -> """创建测试用的 RolloutSample""" # 创建 mock AgentLoopOutput agent_loop_output = MockAgentLoopOutput( - prompt_ids=[ - 151644, - 8948, - 198, - 2610, - 525, - 1207, - 16948, - 11, - 3465, - 553, - 54364, - 14817, - 13, - 1446, - 525, - 264, - 10950, - 17847, - 13, - 151645, - 198, - 151644, - 872, - 198, - 24732, - 21189, - 264, - 400, - 16, - 17, - 40358, - 817, - 2254, - 13, - 758, - 279, - 1156, - 2003, - 11, - 566, - 37102, - 264, - 4843, - 315, - 432, - 26, - 304, - 279, - 2086, - 2003, - 11, - 566, - 37102, - 264, - 8338, - 315, - 1128, - 566, - 702, - 2115, - 13, - 2585, - 1753, - 3220, - 1558, - 566, - 614, - 2115, - 311, - 6248, - 279, - 2254, - 30, - 6771, - 594, - 1744, - 3019, - 553, - 3019, - 323, - 2550, - 279, - 1590, - 4226, - 1283, - 330, - 820, - 3263, - 151645, - 198, - 151644, - 77091, - 198, - ], - response_ids=[ - 14374, - 14822, - 14319, - 12, - 8304, - 74216, - 510, - 16, - 13, - 4127, - 40358, - 25, - 400, - 16, - 17, - 198, - 17, - 13, - 5512, - 2003, - 18024, - 510, - 262, - 481, - 8364, - 37102, - 264, - 4843, - 315, - 279, - 400, - 16, - 17, - 624, - 262, - 481, - 25783, - 7391, - 284, - 57960, - 37018, - 90, - 16, - 15170, - 18, - 92, - 1124, - 15136, - 32882, - 16, - 17, - 284, - 32882, - 19, - 66426, - 18, - 13, - 10657, - 
3311, - 1283, - 1156, - 2003, - 25, - 400, - 16, - 17, - 481, - 32882, - 19, - 284, - 32882, - 23, - 66426, - 19, - 13, - 10440, - 2003, - 18024, - 510, - 262, - 481, - 8364, - 37102, - 264, - 8338, - 315, - 279, - 9664, - 3311, - 1283, - 279, - 1156, - 2003, - 624, - 262, - 481, - 11487, - 2115, - 284, - 400, - 23, - 481, - 400, - 19, - 284, - 400, - 19, - 198, - 262, - 481, - 25783, - 7391, - 2049, - 57960, - 37018, - 90, - 16, - 15170, - 19, - 92, - 1124, - 15136, - 32882, - 19, - 284, - 32882, - 16, - 66426, - 20, - 13, - 13023, - 3311, - 2115, - 510, - 262, - 481, - 8364, - 702, - 3322, - 369, - 264, - 2480, - 2003, - 311, - 6248, - 279, - 2254, - 2041, - 32821, - 894, - 803, - 40358, - 382, - 43434, - 510, - 24732, - 702, - 3070, - 65039, - 23, - 334, - 2115, - 13, - 1260, - 686, - 614, - 3322, - 3220, - 311, - 6248, - 279, - 2254, - 2041, - 32821, - 894, - 803, - 40358, - 13, - 151645, - ], + prompt_ids=torch.randint(0, 32000, (175,)).tolist(), + response_ids=torch.randint(0, 32000, (175,)).tolist(), response_mask=[1] * 175, # 真实的response长度 num_turns=2, metrics=MockAgentLoopMetrics(generate_sequences=1.6468379497528076, tool_calls=0.0), From a1c0f5c6edfb2d3f8a938fcc43c25e921ab7377c Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Mon, 18 Aug 2025 20:52:18 +0800 Subject: [PATCH 065/182] ruff format --- .../rollout/vllm_rollout/vllm_rollout_spmd.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py b/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py index 0d419dcf177..071dd917119 100644 --- a/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py +++ b/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py @@ -109,11 +109,11 @@ def __init__(self, model_path: str, config: DictConfig, tokenizer, model_hf_conf if hasattr(model_hf_config, "max_position_embeddings"): max_position_embeddings = model_hf_config.max_position_embeddings elif hasattr(model_hf_config, 
"llm_config") and hasattr( - model_hf_config.llm_config, "max_position_embeddings" + model_hf_config.llm_config, "max_position_embeddings" ): max_position_embeddings = model_hf_config.llm_config.max_position_embeddings elif hasattr(model_hf_config, "text_config") and hasattr( - model_hf_config.text_config, "max_position_embeddings" + model_hf_config.text_config, "max_position_embeddings" ): max_position_embeddings = model_hf_config.text_config.max_position_embeddings if max_position_embeddings is None: @@ -128,12 +128,12 @@ def __init__(self, model_path: str, config: DictConfig, tokenizer, model_hf_conf rope_scaling_factor = rope_scaling_config.get("factor", 1.0) assert ( - model_hf_config.max_position_embeddings * rope_scaling_factor - >= config.prompt_length + config.response_length + model_hf_config.max_position_embeddings * rope_scaling_factor + >= config.prompt_length + config.response_length ), ( - "model context length should be greater than total sequence length, " - + f"got rope_scaling_factor={rope_scaling_factor} and " - + f"max_position_embeddings={model_hf_config.max_position_embeddings}" + "model context length should be greater than total sequence length, " + + f"got rope_scaling_factor={rope_scaling_factor} and " + + f"max_position_embeddings={model_hf_config.max_position_embeddings}" ) max_model_len = int(config.max_model_len or config.prompt_length + config.response_length) @@ -268,7 +268,7 @@ def generate_sequences(self, prompts: DataProto, **kwargs) -> DataProto: if "multi_modal_data" in non_tensor_batch: vllm_inputs = [] for raw_prompt_ids, multi_modal_data in zip( - non_tensor_batch.pop("raw_prompt_ids"), non_tensor_batch.pop("multi_modal_data"), strict=True + non_tensor_batch.pop("raw_prompt_ids"), non_tensor_batch.pop("multi_modal_data"), strict=True ): vllm_inputs.append({"prompt_token_ids": raw_prompt_ids, "multi_modal_data": multi_modal_data}) else: @@ -390,9 +390,9 @@ def _monkey_patch_compute_logits(model, vocab_size: int): 
original_compute_logits = model.compute_logits def compute_logits( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, ) -> torch.Tensor: logits = original_compute_logits(hidden_states, sampling_metadata) logits[..., vocab_size:] = float("-inf") From 26b55d96625e7e9c9f58f86618182b81ab2e5e4a Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Tue, 19 Aug 2025 10:33:20 +0800 Subject: [PATCH 066/182] update log --- .../config/fully_async_ppo_trainer.yaml | 2 +- .../fully_async_rollouter.py | 42 ++++++++++--------- 2 files changed, 24 insertions(+), 20 deletions(-) diff --git a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml index 9d0a8c67383..f1c4a1c602f 100644 --- a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml +++ b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml @@ -11,7 +11,7 @@ defaults: async_training: # 新鲜度控制 (Freshness Control) staleness_threshold: 3 # 样本新鲜度阈值 - trigger_parameter_sync_step: 10 # >=1 train 每次训练一个batch, 迭代多少次后触发更新 + trigger_parameter_sync_step: 32 # >=1 train 每次训练一个batch, 迭代多少次后触发更新 # Rollout配置 rollout: diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index f349b5e06ed..092ff2add17 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -39,16 +39,16 @@ class FullyAsyncRollouter(RayPPOTrainer): """ def __init__( - self, - config, - tokenizer, - role_worker_mapping: dict[Role, WorkerType], - resource_pool_manager: ResourcePoolManager, - ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, - processor=None, - reward_fn=None, - val_reward_fn=None, - device_name=None, + self, + config, + tokenizer, + role_worker_mapping: dict[Role, WorkerType], + resource_pool_manager: ResourcePoolManager, + 
ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, + processor=None, + reward_fn=None, + val_reward_fn=None, + device_name=None, ): # Store the tokenizer for text processing self.tokenizer = tokenizer @@ -136,7 +136,8 @@ def __init__( self.minimal_bsz, config.actor_rollout_ref.actor.ppo_mini_batch_size ) self.max_required_samples = ( - self.required_samples * (self.staleness_threshold + 1) * config.async_training.trigger_parameter_sync_step + self.required_samples * ( + self.staleness_threshold + 1) * config.async_training.trigger_parameter_sync_step ) print( f"[FullyAsyncRollouter] required_samples : {self.required_samples} " @@ -242,7 +243,8 @@ async def _feed_samples(self): sample_count = 0 should_stop = False - progress_bar = tqdm(total=self.total_rollout_steps, initial=self.global_steps, desc="Training Progress") + progress_bar = tqdm(total=self.total_rollout_steps / self.required_samples, initial=self.global_steps, + desc="Training Progress") for epoch, batch_dict in continuous_iterator: if should_stop: # 检查停止标志 @@ -280,7 +282,8 @@ async def _feed_samples(self): should_stop = True # 设置停止标志 break - progress_bar.update(1) + if self.global_steps % self.required_samples == 0: + progress_bar.update(1) self.global_steps += 1 sample_count += 1 @@ -363,7 +366,7 @@ async def _process_single_sample_streaming(self, partial_rollout_sample): if processing_time > self.max_processing_time: self.max_processing_time = processing_time - print(f"[FullyAsyncRollouter] rollout {partial_rollout_sample.sample_id} cost {processing_time:.2f}s") + # print(f"[FullyAsyncRollouter] rollout {partial_rollout_sample.sample_id} cost {processing_time:.2f}s") async def _consumer_worker(self): """消费者协程,负责从结果队列获取处理结果并放入消息队列""" @@ -523,11 +526,12 @@ async def _async_monitor_loop(self): last_stats_time = current_time # pause 和 resume 直接,不进行恢复操作 - if self.monitor_loop_trigger and not await self._should_pause_generation(): - async with self.lock: - print("[FullyAsyncRollouter][MonitorLoop] 
trigger resume") - self.paused = False - self.condition.notify_all() + if self.monitor_loop_trigger and self.paused: + if await self._should_pause_generation(): + async with self.lock: + print("[FullyAsyncRollouter][MonitorLoop] trigger resume") + self.paused = False + self.condition.notify_all() async def _should_pause_generation(self) -> bool: if self.paused: From 749d4df3e33bf19070f255d206444f334c764b11 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Wed, 20 Aug 2025 15:55:48 +0800 Subject: [PATCH 067/182] partial rollout --- recipe/fully_async_policy/detach_utils.py | 5 +- .../fully_async_rollouter.py | 74 ++++++++++--------- verl/experimental/agent_loop/__init__.py | 3 +- verl/experimental/agent_loop/agent_loop.py | 38 ++++++++-- .../partial_single_turn_agent_loop.py | 68 +++++++++++++++++ .../rollout/vllm_rollout/vllm_async_server.py | 52 ++++++++++++- 6 files changed, 194 insertions(+), 46 deletions(-) create mode 100644 verl/experimental/agent_loop/partial_single_turn_agent_loop.py diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py index 426a51ae35e..3ac998bc82a 100644 --- a/recipe/fully_async_policy/detach_utils.py +++ b/recipe/fully_async_policy/detach_utils.py @@ -46,7 +46,6 @@ class RolloutSample: # Processing metadata processing_time: float - generation_timestamp: float param_version: int @@ -76,6 +75,10 @@ def prepare_single_generation_data(batch_dict, global_steps) -> DataProto: batch_keys=batch_keys_to_pop, non_tensor_batch_keys=non_tensor_batch_keys_to_pop, ) + + # 设置使用支持partial的agent + full_batch.non_tensor_batch["agent_name"] = np.array(["partial_single_turn_agent"] * len(full_batch), dtype=object) + # 添加全局步数到生成数据 full_batch.meta_info["global_steps"] = global_steps diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 092ff2add17..2500a215d9a 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ 
b/recipe/fully_async_policy/fully_async_rollouter.py @@ -39,16 +39,16 @@ class FullyAsyncRollouter(RayPPOTrainer): """ def __init__( - self, - config, - tokenizer, - role_worker_mapping: dict[Role, WorkerType], - resource_pool_manager: ResourcePoolManager, - ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, - processor=None, - reward_fn=None, - val_reward_fn=None, - device_name=None, + self, + config, + tokenizer, + role_worker_mapping: dict[Role, WorkerType], + resource_pool_manager: ResourcePoolManager, + ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, + processor=None, + reward_fn=None, + val_reward_fn=None, + device_name=None, ): # Store the tokenizer for text processing self.tokenizer = tokenizer @@ -136,8 +136,7 @@ def __init__( self.minimal_bsz, config.actor_rollout_ref.actor.ppo_mini_batch_size ) self.max_required_samples = ( - self.required_samples * ( - self.staleness_threshold + 1) * config.async_training.trigger_parameter_sync_step + self.required_samples * (self.staleness_threshold + 1) * config.async_training.trigger_parameter_sync_step ) print( f"[FullyAsyncRollouter] required_samples : {self.required_samples} " @@ -161,6 +160,7 @@ def __init__( self.pending_queue = asyncio.Queue(maxsize=100) self.active_tasks = set() self.result_queue = asyncio.Queue() + self.cancel_queue = asyncio.Queue() # 通过 pause 和 resume 控制 monitor_loop 中,是否进行 尝试恢复 操作 self.monitor_loop_trigger = True @@ -243,8 +243,9 @@ async def _feed_samples(self): sample_count = 0 should_stop = False - progress_bar = tqdm(total=self.total_rollout_steps / self.required_samples, initial=self.global_steps, - desc="Training Progress") + progress_bar = tqdm( + total=self.total_rollout_steps / self.required_samples, initial=self.global_steps, desc="Training Progress" + ) for epoch, batch_dict in continuous_iterator: if should_stop: # 检查停止标志 @@ -257,8 +258,7 @@ async def _feed_samples(self): for rollout_n_index in range(self.config.actor_rollout_ref.rollout.n): sample_id = 
f"sample_{epoch}_{sample_count}_{rollout_n_index}" - # 创建部分 RolloutSample,不包含 _gen_data(因为它不在数据类定义中) - partial_rollout_sample = RolloutSample( + rollout_sample = RolloutSample( full_batch=full_batch, agent_loop_output=None, # 待处理后填充 sample_id=sample_id, @@ -266,11 +266,10 @@ async def _feed_samples(self): rollout_n_index=rollout_n_index, original_sample_index=sample_count, processing_time=0.0, # 待处理后填充 - generation_timestamp=0.0, # 待处理后填充 param_version=0, # 待处理后填充 ) - await self.pending_queue.put(partial_rollout_sample) + await self.pending_queue.put(rollout_sample) # 检查是否到达最后一步 if self.global_steps >= self.total_rollout_steps: @@ -297,7 +296,11 @@ async def _processor_worker(self): """流式处理工作协程 - 逐个样本立即提交处理,不等待批次""" while True: - partial_rollout_sample = await self.pending_queue.get() + if not self.cancel_queue.empty(): + print(f"self.cancel_queue {self.cancel_queue.qsize()}") + rollout_sample = await self.cancel_queue.get() + else: + rollout_sample = await self.pending_queue.get() self.staleness_samples += 1 async with self.lock: @@ -312,7 +315,7 @@ async def _processor_worker(self): # 获取待处理的部分 RolloutSample async with self.lock: - if partial_rollout_sample == "DONE": + if rollout_sample == "DONE": print("[FullyAsyncRollouter][Processor] 收到结束信号,等待剩余任务完成...") # 等待所有活动任务完成 if self.active_tasks: @@ -337,37 +340,40 @@ async def _processor_worker(self): while self.paused: await self.condition.wait() task = asyncio.create_task( - self._process_single_sample_streaming(partial_rollout_sample), - name=f"process_{partial_rollout_sample.sample_id}", + self._process_single_sample_streaming(rollout_sample), + name=rollout_sample.sample_id, ) self.active_tasks.add(task) # 标记队列任务完成 self.pending_queue.task_done() - async def _process_single_sample_streaming(self, partial_rollout_sample): + async def _process_single_sample_streaming(self, rollout_sample: RolloutSample): """流式处理单个样本""" # 调用异步生成方法 agent_loop_output, processing_time = await 
self.async_rollout_manager.generate_single_sample_async( - partial_rollout_sample.full_batch, partial_rollout_sample.sample_id + rollout_sample.full_batch, rollout_sample.agent_loop_output ) # 直接更新 RolloutSample 对象,填充剩余字段 - partial_rollout_sample.agent_loop_output = agent_loop_output - partial_rollout_sample.processing_time = processing_time - partial_rollout_sample.generation_timestamp = time.time() - partial_rollout_sample.param_version = self.current_param_version + rollout_sample.agent_loop_output = agent_loop_output + rollout_sample.processing_time += processing_time + rollout_sample.param_version = self.current_param_version - # 直接放入结果队列 - await self.result_queue.put(partial_rollout_sample) + print(f"[FullyAsyncRollouter] rollout {rollout_sample.sample_id} cost {processing_time:.2f}s") + + if agent_loop_output.is_cancel: + # 放入 cancel 队列中,等待恢复生成 + await self.cancel_queue.put(rollout_sample) + else: + # 否则放入结果队列 + await self.result_queue.put(rollout_sample) self.processed_sample_count += 1 # 更新最大处理时间统计 if processing_time > self.max_processing_time: self.max_processing_time = processing_time - # print(f"[FullyAsyncRollouter] rollout {partial_rollout_sample.sample_id} cost {processing_time:.2f}s") - async def _consumer_worker(self): """消费者协程,负责从结果队列获取处理结果并放入消息队列""" while True: @@ -575,6 +581,8 @@ async def pause(self): print("[FullyAsyncRollouter][Public] pause") async with self.lock: self.paused = True + # 取消rollout所有任务 + # await self.async_rollout_manager.cancel() if self.active_tasks: print("[FullyAsyncRollouter][Public][Pause]") await asyncio.gather(*self.active_tasks, return_exceptions=True) @@ -586,7 +594,6 @@ async def resume(self): """resume rollout TODO integrated Partial Rollout """ - print("[FullyAsyncRollouter][Public] resume") async with self.lock: self.paused = False self.condition.notify_all() @@ -609,6 +616,7 @@ async def get_statistics(self) -> dict: "max_required_samples": self.max_required_samples, "required_samples": self.required_samples, 
"staleness_threshold": self.staleness_threshold, + "cancel_queue_size": self.cancel_queue.qsize(), } return stats diff --git a/verl/experimental/agent_loop/__init__.py b/verl/experimental/agent_loop/__init__.py index a39171db764..0d131dd1d3a 100644 --- a/verl/experimental/agent_loop/__init__.py +++ b/verl/experimental/agent_loop/__init__.py @@ -14,8 +14,9 @@ from .agent_loop import AgentLoopBase, AgentLoopManager from .single_turn_agent_loop import SingleTurnAgentLoop +from .partial_single_turn_agent_loop import PartialSingleTurnAgentLoop from .tool_agent_loop import ToolAgentLoop -_ = [SingleTurnAgentLoop, ToolAgentLoop] +_ = [SingleTurnAgentLoop, ToolAgentLoop, PartialSingleTurnAgentLoop] __all__ = ["AgentLoopBase", "AgentLoopManager"] diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py index 29f2b30edb7..dcb7184df5d 100644 --- a/verl/experimental/agent_loop/agent_loop.py +++ b/verl/experimental/agent_loop/agent_loop.py @@ -18,7 +18,7 @@ import random import time from abc import ABC, abstractmethod -from typing import Any +from typing import Any, Optional import hydra import numpy as np @@ -104,6 +104,15 @@ async def generate( ) return output + async def generate_for_partial(self, request_id, prompt_ids, sampling_params): + server = self._choose_server(request_id) + output = await server.generate_for_partial.remote( + request_id=request_id, + prompt_ids=prompt_ids, + sampling_params=sampling_params, + ) + return output + class AgentLoopMetrics(BaseModel): """Agent loop performance metrics.""" @@ -125,6 +134,8 @@ class AgentLoopOutput(BaseModel): """Number of chat turns, including user, assistant, tool.""" metrics: AgentLoopMetrics """Auxiliary performance metrics""" + is_cancel: bool = False + """Indicates whether the request was interrupted""" # make hydra.utils.instantiate happy @@ -169,12 +180,15 @@ def init_class(cls, config: DictConfig, tokenizer: AutoTokenizer, **kwargs): cls._class_initialized = True 
@abstractmethod - async def run(self, messages: list[dict[str, Any]], sampling_params: dict[str, Any]) -> AgentLoopOutput: + async def run( + self, messages: list[dict[str, Any]], sampling_params: dict[str, Any], partial_output: Optional[AgentLoopOutput] + ) -> AgentLoopOutput: """Run agent loop to interact with LLM server and environment. Args: messages (List[Dict[str, Any]]): Input messages. sampling_params (Dict[str, Any]): LLM sampling params. + partial_output: Optional[AgentLoopOutput]: already rollout result. Returns: AgentLoopOutput: Agent loop output. @@ -368,11 +382,14 @@ async def generate_sequences(self, batch: DataProto) -> DataProto: output = postprocess_agent_loop_outputs(outputs, self.tokenizer, self.config) return output - async def generate_sequences_no_post(self, batch: DataProto) -> list[AgentLoopOutput]: + async def generate_sequences_no_post( + self, batch: DataProto, partial_output: Optional[AgentLoopOutput] + ) -> list[AgentLoopOutput]: """Generate sequences from agent loop. Args: batch (DataProto): Input batch. + partial_output: Optional[AgentLoopOutput]: already rollout result. Returns: list[AgentLoopOutput]: List of agent loop outputs, one per sample in the batch. 
@@ -413,7 +430,9 @@ async def generate_sequences_no_post(self, batch: DataProto) -> list[AgentLoopOu for agent_name, messages, trajectory in zip(agent_names, raw_prompts, trajectory_info, strict=True): tasks.append( - asyncio.create_task(self._run_agent_loop(agent_name, messages.tolist(), sampling_params, trajectory)) + asyncio.create_task( + self._run_agent_loop(agent_name, messages.tolist(), sampling_params, trajectory, partial_output) + ) ) outputs = await asyncio.gather(*tasks) @@ -425,6 +444,7 @@ async def _run_agent_loop( messages: list[dict[str, Any]], sampling_params: dict[str, Any], trajectory: dict[str, Any], + partial_output: Optional[AgentLoopOutput], ) -> AgentLoopOutput: with rollout_trace_attr( step=trajectory["step"], @@ -444,7 +464,7 @@ async def _run_agent_loop( server_manager=self.server_manager, tokenizer=self.tokenizer, ) - output = await agent_loop.run(messages, sampling_params) + output = await agent_loop.run(messages, sampling_params, partial_output) return output @@ -582,13 +602,15 @@ def generate_sequences(self, prompts: DataProto) -> DataProto: output.meta_info = {"timing": timing} return output - async def generate_single_sample_async(self, sample: DataProto, sample_id: str) -> tuple[AgentLoopOutput, float]: + async def generate_single_sample_async( + self, sample: DataProto, partial_output: Optional[AgentLoopOutput] + ) -> tuple[AgentLoopOutput, float]: """ 异步处理单个样本 - 用于流式推理的核心方法 Args: sample: 单个样本数据 - sample_id: 样本ID + partial_output: Optional[AgentLoopOutput]: already rollout result. 
Returns: tuple[AgentLoopOutput, float]: 处理结果和处理时间 @@ -599,7 +621,7 @@ async def generate_single_sample_async(self, sample: DataProto, sample_id: str) worker = self._select_best_worker() # 异步处理单个样本 - 使用无后处理版本获取原始AgentLoopOutput - output_future = worker.generate_sequences_no_post.remote(sample) + output_future = worker.generate_sequences_no_post.remote(sample, partial_output) outputs = await asyncio.wrap_future(output_future.future()) processing_time = time.time() - start_time diff --git a/verl/experimental/agent_loop/partial_single_turn_agent_loop.py b/verl/experimental/agent_loop/partial_single_turn_agent_loop.py new file mode 100644 index 00000000000..fd2a7292e67 --- /dev/null +++ b/verl/experimental/agent_loop/partial_single_turn_agent_loop.py @@ -0,0 +1,68 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import logging +import os +from typing import Any, Optional +from uuid import uuid4 + +from verl.experimental.agent_loop.agent_loop import AgentLoopBase, AgentLoopOutput, register +from verl.utils.profiler import simple_timer + +logger = logging.getLogger(__file__) +logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) + + +@register("partial_single_turn_agent") +class PartialSingleTurnAgentLoop(AgentLoopBase): + """Naive agent loop that only do single turn chat completion.""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.prompt_length = self.config.actor_rollout_ref.rollout.prompt_length + self.response_length = self.config.actor_rollout_ref.rollout.response_length + + async def run( + self, messages: list[dict[str, Any]], sampling_params: dict[str, Any], output: Optional[AgentLoopOutput] + ) -> AgentLoopOutput: + if not output: + prompt_ids = await self.loop.run_in_executor( + None, lambda: self.tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True) + ) + else: + # 恢复暂停的样本,结果直接添加到 prompt_ids 后面 + prompt_ids = output.prompt_ids + output.response_ids + + metrics = {} + request_id = uuid4().hex + with simple_timer("generate_sequences", metrics): + response_ids, is_cancel = await self.server_manager.generate_for_partial( + request_id=request_id, prompt_ids=prompt_ids, sampling_params=sampling_params + ) + + if not output: + response_mask = [1] * len(response_ids) + # 暂停待恢复样本, 把输出结果加到 response_ids 后,并重置 response_mask + else: + prompt_ids = output.prompt_ids + response_ids = output.response_ids + response_ids + response_mask = [1] * len(response_ids) + + return AgentLoopOutput( + prompt_ids=prompt_ids, + response_ids=response_ids[: self.response_length], + response_mask=response_mask[: self.response_length], + num_turns=2, + metrics=metrics, + is_cancel=is_cancel, + ) diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py index 
a5cc0b83e59..3c238912cca 100644 --- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py +++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py @@ -11,17 +11,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import asyncio import logging import os import pickle -from typing import Any, Callable, Optional +from contextlib import ExitStack +from typing import Any, Callable, Optional, Coroutine, Sequence import ray import zmq -from omegaconf import DictConfig +from omegaconf import DictConfig, ListConfig from starlette.requests import Request from starlette.responses import JSONResponse, StreamingResponse -from vllm import SamplingParams +from vllm import SamplingParams, RequestOutput +from vllm.config import CompilationConfig, CompilationLevel from vllm.engine.arg_utils import AsyncEngineArgs from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.protocol import ChatCompletionRequest, ChatCompletionResponse, ErrorResponse @@ -204,6 +207,9 @@ def __init__(self, config: DictConfig, vllm_dp_size: int, vllm_dp_rank: int, wg_ self.vllm_dp_rank = vllm_dp_rank self.wg_prefix = wg_prefix self.engine: AsyncLLM = None + # for cancel + self.cancel_event: dict[str, asyncio.Event] = {} + self.req_output: dict[str, Optional[RequestOutput]] = {} async def init_engine(self): """Init vLLM AsyncLLM engine.""" @@ -326,6 +332,46 @@ async def generate(self, prompt_ids: list[int], sampling_params: dict[str, Any], return final_res.outputs[0].token_ids + async def _generate_step(self, prompt_ids: list[int], sampling_params: dict[str, Any], request_id: str): + max_tokens = self.max_model_len - len(prompt_ids) + sampling_params = SamplingParams(max_tokens=max_tokens, **sampling_params) + prompt = TokensPrompt(prompt_token_ids=prompt_ids) + generator = self.engine.generate(prompt=prompt, sampling_params=sampling_params, 
request_id=request_id) + + # Get final response + self.req_output[request_id]: Optional[RequestOutput] = None + async for output in generator: + self.req_output[request_id] = output + assert self.req_output[request_id] is not None + + async def generate_for_partial( + self, prompt_ids: list[int], sampling_params: dict[str, Any], request_id: str + ) -> tuple[Sequence[int], bool] | tuple[str, bool]: + with ExitStack() as stack: + stack.callback(lambda: self.cancel_event.pop(request_id, None)) + stack.callback(lambda: self.req_output.pop(request_id, None)) + + self.cancel_event[request_id] = asyncio.Event() + cancel_handle = asyncio.create_task(self.cancel_event[request_id].wait()) + generation_handle = asyncio.create_task(self._generate_step(prompt_ids, sampling_params, request_id)) + + done, pend = await asyncio.wait([generation_handle, cancel_handle], return_when=asyncio.FIRST_COMPLETED) + + for task in done: + await task + + for task in pend: + task.cancel() + + token_ids = self.req_output[request_id].outputs[0].token_ids + is_cancel = generation_handle not in done + return token_ids, is_cancel + + async def cancel(self): + for request_id in self.cancel_event: + self.cancel_event[request_id].set() + print(f"[ExternalRayDistributedExecutor] cancel request_id {request_id}") + async def wake_up(self): if self.config.rollout.free_cache_engine: await self.engine.wake_up() From f5364bedbb918957dd601959038f235589569dcc Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Wed, 20 Aug 2025 16:38:25 +0800 Subject: [PATCH 068/182] partial rollout cancel --- .../config/fully_async_ppo_trainer.yaml | 2 +- .../fully_async_rollouter.py | 10 ++---- verl/experimental/agent_loop/agent_loop.py | 5 +++ .../rollout/vllm_rollout/vllm_async_server.py | 32 +++++++++++-------- 4 files changed, 27 insertions(+), 22 deletions(-) diff --git a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml index f1c4a1c602f..e33ebdb4408 
100644 --- a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml +++ b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml @@ -11,7 +11,7 @@ defaults: async_training: # 新鲜度控制 (Freshness Control) staleness_threshold: 3 # 样本新鲜度阈值 - trigger_parameter_sync_step: 32 # >=1 train 每次训练一个batch, 迭代多少次后触发更新 + trigger_parameter_sync_step: 1 # >=1 train 每次训练一个batch, 迭代多少次后触发更新 # Rollout配置 rollout: diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 2500a215d9a..ce616be95de 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -528,7 +528,7 @@ async def _async_monitor_loop(self): current_time = time.time() if current_time - last_stats_time >= stats_interval: stats = await self.get_statistics() - print(f"[FullyAsyncRollouter][MonitorLoop] {stats}") + pprint(stats) last_stats_time = current_time # pause 和 resume 直接,不进行恢复操作 @@ -578,22 +578,18 @@ async def pause(self): """pause rollout TODO integrated Partial Rollout """ - print("[FullyAsyncRollouter][Public] pause") + print("[FullyAsyncRollouter][Public][Pause]") async with self.lock: self.paused = True # 取消rollout所有任务 - # await self.async_rollout_manager.cancel() + self.async_rollout_manager.cancel() if self.active_tasks: - print("[FullyAsyncRollouter][Public][Pause]") await asyncio.gather(*self.active_tasks, return_exceptions=True) self.active_tasks.clear() print("[FullyAsyncRollouter][Public][Pause] All active tasks completed") self.monitor_loop_trigger = False async def resume(self): - """resume rollout - TODO integrated Partial Rollout - """ async with self.lock: self.paused = False self.condition.notify_all() diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py index dcb7184df5d..668a7cc0d62 100644 --- a/verl/experimental/agent_loop/agent_loop.py +++ b/verl/experimental/agent_loop/agent_loop.py @@ -673,3 +673,8 @@ def 
wake_up(self): def sleep(self): """Sleep all rollout server instances.""" ray.get([server.sleep.remote() for server in self.async_llm_servers]) + + def cancel(self): + """Cancel all rollout tasks.""" + ray.get([server.cancel.remote() for server in self.async_llm_servers]) + diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py index 3c238912cca..2f62a9bd6b7 100644 --- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py +++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py @@ -208,6 +208,8 @@ def __init__(self, config: DictConfig, vllm_dp_size: int, vllm_dp_rank: int, wg_ self.wg_prefix = wg_prefix self.engine: AsyncLLM = None # for cancel + + self.lock = asyncio.Lock() self.cancel_event: dict[str, asyncio.Event] = {} self.req_output: dict[str, Optional[RequestOutput]] = {} @@ -347,30 +349,32 @@ async def _generate_step(self, prompt_ids: list[int], sampling_params: dict[str, async def generate_for_partial( self, prompt_ids: list[int], sampling_params: dict[str, Any], request_id: str ) -> tuple[Sequence[int], bool] | tuple[str, bool]: - with ExitStack() as stack: - stack.callback(lambda: self.cancel_event.pop(request_id, None)) - stack.callback(lambda: self.req_output.pop(request_id, None)) - + # 设置中断标志 + async with self.lock: self.cancel_event[request_id] = asyncio.Event() cancel_handle = asyncio.create_task(self.cancel_event[request_id].wait()) - generation_handle = asyncio.create_task(self._generate_step(prompt_ids, sampling_params, request_id)) - done, pend = await asyncio.wait([generation_handle, cancel_handle], return_when=asyncio.FIRST_COMPLETED) + generation_handle = asyncio.create_task(self._generate_step(prompt_ids, sampling_params, request_id)) + done, pend = await asyncio.wait([generation_handle, cancel_handle], return_when=asyncio.FIRST_COMPLETED) - for task in done: - await task + for task in done: + await task - for task in pend: - task.cancel() + for task in pend: + 
task.cancel() + async with self.lock: token_ids = self.req_output[request_id].outputs[0].token_ids is_cancel = generation_handle not in done - return token_ids, is_cancel + self.cancel_event.pop(request_id, None) + self.req_output.pop(request_id, None) + return token_ids, is_cancel async def cancel(self): - for request_id in self.cancel_event: - self.cancel_event[request_id].set() - print(f"[ExternalRayDistributedExecutor] cancel request_id {request_id}") + async with self.lock: + for request_id in self.cancel_event: + self.cancel_event[request_id].set() + print(f"[ExternalRayDistributedExecutor] cancel request_id {request_id}") async def wake_up(self): if self.config.rollout.free_cache_engine: From f547a22a65ee0b3ac00f51949b3145fcd50037f3 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Wed, 20 Aug 2025 17:24:28 +0800 Subject: [PATCH 069/182] partial rollout cancel debug --- .../config/fully_async_ppo_trainer.yaml | 1 + recipe/fully_async_policy/fully_async_rollouter.py | 11 ++++++++--- verl/experimental/agent_loop/agent_loop.py | 8 ++++---- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml index e33ebdb4408..30f5ec4bf87 100644 --- a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml +++ b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml @@ -12,6 +12,7 @@ async_training: # 新鲜度控制 (Freshness Control) staleness_threshold: 3 # 样本新鲜度阈值 trigger_parameter_sync_step: 1 # >=1 train 每次训练一个batch, 迭代多少次后触发更新 + partial_rollout: True # 同步参数时,是否中断 rollout # Rollout配置 rollout: diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index ce616be95de..8a69536fd58 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -312,6 +312,7 @@ async def _processor_worker(self): self.paused = True while 
self.paused: await self.condition.wait() + print("等待已提交的任务结束 condition") # 获取待处理的部分 RolloutSample async with self.lock: @@ -339,6 +340,7 @@ async def _processor_worker(self): # pause结束后,获取到锁,还需要判断是否是暂停阶段,否则继续等待 while self.paused: await self.condition.wait() + print("立即提交单个样本处理 condition") task = asyncio.create_task( self._process_single_sample_streaming(rollout_sample), name=rollout_sample.sample_id, @@ -528,14 +530,15 @@ async def _async_monitor_loop(self): current_time = time.time() if current_time - last_stats_time >= stats_interval: stats = await self.get_statistics() + print("[FullyAsyncRollouter][MonitorLoop][Statistics]") pprint(stats) last_stats_time = current_time # pause 和 resume 直接,不进行恢复操作 if self.monitor_loop_trigger and self.paused: - if await self._should_pause_generation(): + if not await self._should_pause_generation(): async with self.lock: - print("[FullyAsyncRollouter][MonitorLoop] trigger resume") + print("[FullyAsyncRollouter][MonitorLoop][Resume]") self.paused = False self.condition.notify_all() @@ -582,7 +585,8 @@ async def pause(self): async with self.lock: self.paused = True # 取消rollout所有任务 - self.async_rollout_manager.cancel() + if self.config.async_training.partial_rollout: + await self.async_rollout_manager.cancel_async() if self.active_tasks: await asyncio.gather(*self.active_tasks, return_exceptions=True) self.active_tasks.clear() @@ -590,6 +594,7 @@ async def pause(self): self.monitor_loop_trigger = False async def resume(self): + print("[FullyAsyncRollouter][Public][Resume]") async with self.lock: self.paused = False self.condition.notify_all() diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py index 668a7cc0d62..6bd90fe9b44 100644 --- a/verl/experimental/agent_loop/agent_loop.py +++ b/verl/experimental/agent_loop/agent_loop.py @@ -674,7 +674,7 @@ def sleep(self): """Sleep all rollout server instances.""" ray.get([server.sleep.remote() for server in self.async_llm_servers]) - def 
cancel(self): - """Cancel all rollout tasks.""" - ray.get([server.cancel.remote() for server in self.async_llm_servers]) - + async def cancel_async(self): + """Cancel all rollout tasks asynchronously.""" + futures = [server.cancel.remote() for server in self.async_llm_servers] + await asyncio.gather(*[asyncio.wrap_future(future.future()) for future in futures], return_exceptions=True) From a3e11f9441484de2c86cc90b6897602ca1ba89e9 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Wed, 20 Aug 2025 19:14:53 +0800 Subject: [PATCH 070/182] partial rollout cancel success --- .../fully_async_rollouter.py | 20 +++++++++++-------- .../rollout/vllm_rollout/vllm_async_server.py | 1 - 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 8a69536fd58..1d1490b32ce 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -296,23 +296,22 @@ async def _processor_worker(self): """流式处理工作协程 - 逐个样本立即提交处理,不等待批次""" while True: + simple_from_cancel_queue = False if not self.cancel_queue.empty(): - print(f"self.cancel_queue {self.cancel_queue.qsize()}") rollout_sample = await self.cancel_queue.get() + simple_from_cancel_queue = True else: rollout_sample = await self.pending_queue.get() - self.staleness_samples += 1 + self.staleness_samples += 1 async with self.lock: if await self._should_pause_generation(): - print("[FullyAsyncRollouter][Processor] 等待已提交的任务结束") if self.active_tasks: await asyncio.gather(*self.active_tasks, return_exceptions=True) self.active_tasks.clear() self.paused = True while self.paused: await self.condition.wait() - print("等待已提交的任务结束 condition") # 获取待处理的部分 RolloutSample async with self.lock: @@ -340,7 +339,6 @@ async def _processor_worker(self): # pause结束后,获取到锁,还需要判断是否是暂停阶段,否则继续等待 while self.paused: await self.condition.wait() - print("立即提交单个样本处理 condition") task = asyncio.create_task( 
self._process_single_sample_streaming(rollout_sample), name=rollout_sample.sample_id, @@ -348,7 +346,10 @@ async def _processor_worker(self): self.active_tasks.add(task) # 标记队列任务完成 - self.pending_queue.task_done() + if simple_from_cancel_queue: + self.cancel_queue.task_done() + else: + self.pending_queue.task_done() async def _process_single_sample_streaming(self, rollout_sample: RolloutSample): """流式处理单个样本""" @@ -362,7 +363,10 @@ async def _process_single_sample_streaming(self, rollout_sample: RolloutSample): rollout_sample.processing_time += processing_time rollout_sample.param_version = self.current_param_version - print(f"[FullyAsyncRollouter] rollout {rollout_sample.sample_id} cost {processing_time:.2f}s") + # print( + # f"[FullyAsyncRollouter] rollout {rollout_sample.sample_id} " + # f"cost {processing_time:.2f}s " + # f"response_len: {len(rollout_sample.agent_loop_output.response_ids)}") if agent_loop_output.is_cancel: # 放入 cancel 队列中,等待恢复生成 @@ -579,7 +583,7 @@ async def _should_pause_generation(self) -> bool: async def pause(self): """pause rollout - TODO integrated Partial Rollout + TODO async_rollout_manager clear kv cache """ print("[FullyAsyncRollouter][Public][Pause]") async with self.lock: diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py index 2f62a9bd6b7..06e8626ad42 100644 --- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py +++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py @@ -374,7 +374,6 @@ async def cancel(self): async with self.lock: for request_id in self.cancel_event: self.cancel_event[request_id].set() - print(f"[ExternalRayDistributedExecutor] cancel request_id {request_id}") async def wake_up(self): if self.config.rollout.free_cache_engine: From 1fdd90d460fbac9956bd28265b9e2df1fa7ac0d7 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Thu, 21 Aug 2025 20:42:40 +0800 Subject: [PATCH 071/182] partial rollout cancel debug --- 
.../config/fully_async_ppo_trainer.yaml | 6 +- .../dapo_7b_math_fsdp2_4_12.sh | 8 +- .../fully_async_rollouter.py | 80 +++++++++++-------- verl/experimental/agent_loop/agent_loop.py | 5 ++ .../rollout/vllm_rollout/vllm_async_server.py | 11 ++- 5 files changed, 71 insertions(+), 39 deletions(-) diff --git a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml index 30f5ec4bf87..0714e107ee4 100644 --- a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml +++ b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml @@ -10,9 +10,9 @@ defaults: async_training: # 新鲜度控制 (Freshness Control) - staleness_threshold: 3 # 样本新鲜度阈值 - trigger_parameter_sync_step: 1 # >=1 train 每次训练一个batch, 迭代多少次后触发更新 - partial_rollout: True # 同步参数时,是否中断 rollout + staleness_threshold: 1 # 样本新鲜度阈值 + trigger_parameter_sync_step: 4 # >=1 train 每次训练一个batch, 迭代多少次后触发更新 + partial_rollout: True # 同步参数时,是否中断 rollout # Rollout配置 rollout: diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh b/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh index 86cd25affe2..fe490af24ea 100644 --- a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh +++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh @@ -76,9 +76,10 @@ train_prompt_bsz=0 gen_prompt_bsz=1 n_resp_per_prompt=16 train_prompt_mini_bsz=32 -staleness_threshold=10 +staleness_threshold=1 total_rollout_steps=$(((512*16*100))) -trigger_parameter_sync_step=32 +trigger_parameter_sync_step=4 +partial_rollout=True /home/hadoop-djst-algoplat/miniconda3/bin/python -m recipe.fully_async_policy.fully_async_main \ data.train_files="${TRAIN_FILE}" \ @@ -159,4 +160,5 @@ trigger_parameter_sync_step=32 rollout.total_rollout_steps="${total_rollout_steps}" \ rollout.total_epochs=10 \ async_training.staleness_threshold="${staleness_threshold}" \ - async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" + 
async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ + async_training.partial_rollout="${partial_rollout}" diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 1d1490b32ce..0982954db75 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -62,6 +62,8 @@ def __init__( assert not self.hybrid_engine assert self.config.data.train_batch_size == 0, "train_batch_size must be zero" assert self.config.data.gen_batch_size == 1, "gen_batch_size must be one" + assert self.config.async_training.staleness_threshold >= 0, "staleness_threshold must larger than 0" + assert self.config.async_training.trigger_parameter_sync_step >= 1, "trigger_parameter_sync_step must larger than 1" self.role_worker_mapping = role_worker_mapping self.resource_pool_manager = resource_pool_manager @@ -304,33 +306,46 @@ async def _processor_worker(self): rollout_sample = await self.pending_queue.get() self.staleness_samples += 1 - async with self.lock: - if await self._should_pause_generation(): - if self.active_tasks: - await asyncio.gather(*self.active_tasks, return_exceptions=True) - self.active_tasks.clear() + # 判断是否需要暂停 + # self.paused 由 pause() 和 self._should_pause_generation() 负责修改 + if self.paused or await self._should_pause_generation(): + print("[FullyAsyncRollouter][Processor] 收到暂停信号,等待剩余任务完成...") + while self.active_tasks: + async with self.lock: + # 获取锁后,active_tasks 数量会发生变化,需要再次校验 + if self.active_tasks: + done_tasks, self.active_tasks = await asyncio.wait( + self.active_tasks, return_when=asyncio.FIRST_COMPLETED + ) + for task in done_tasks: + await task + async with self.lock: self.paused = True - while self.paused: - await self.condition.wait() + + async with self.lock: + while self.paused: + await self.condition.wait() # 获取待处理的部分 RolloutSample - async with self.lock: - if rollout_sample == "DONE": - 
print("[FullyAsyncRollouter][Processor] 收到结束信号,等待剩余任务完成...") - # 等待所有活动任务完成 - if self.active_tasks: - await asyncio.gather(*self.active_tasks, return_exceptions=True) - self.active_tasks.clear() - break + if rollout_sample == "DONE": + print("[FullyAsyncRollouter][Processor] 收到结束信号,等待剩余任务完成...") + while self.active_tasks: + async with self.lock: + if self.active_tasks: + done_tasks, self.active_tasks = await asyncio.wait( + self.active_tasks, return_when=asyncio.FIRST_COMPLETED + ) + for task in done_tasks: + await task + break # 检查并发数是否超限 - async with self.lock: - while len(self.active_tasks) >= self.max_concurrent_samples: - # 等待至少一个任务完成 - done_tasks, self.active_tasks = await asyncio.wait( - self.active_tasks, return_when=asyncio.FIRST_COMPLETED - ) - # 清理已完成的任务 + while len(self.active_tasks) >= self.max_concurrent_samples: + async with self.lock: + if self.active_tasks: + done_tasks, self.active_tasks = await asyncio.wait( + self.active_tasks, return_when=asyncio.FIRST_COMPLETED + ) for task in done_tasks: await task @@ -353,7 +368,6 @@ async def _processor_worker(self): async def _process_single_sample_streaming(self, rollout_sample: RolloutSample): """流式处理单个样本""" - # 调用异步生成方法 agent_loop_output, processing_time = await self.async_rollout_manager.generate_single_sample_async( rollout_sample.full_batch, rollout_sample.agent_loop_output @@ -366,7 +380,9 @@ async def _process_single_sample_streaming(self, rollout_sample: RolloutSample): # print( # f"[FullyAsyncRollouter] rollout {rollout_sample.sample_id} " # f"cost {processing_time:.2f}s " - # f"response_len: {len(rollout_sample.agent_loop_output.response_ids)}") + # f"len: {len(rollout_sample.agent_loop_output.response_ids)} " + # f"cancel: {agent_loop_output.is_cancel} " + # ) if agent_loop_output.is_cancel: # 放入 cancel 队列中,等待恢复生成 @@ -538,17 +554,14 @@ async def _async_monitor_loop(self): pprint(stats) last_stats_time = current_time - # pause 和 resume 直接,不进行恢复操作 - if self.monitor_loop_trigger and self.paused: + 
# pause 和 resume 之间,不进行恢复操作 + if self.monitor_loop_trigger: if not await self._should_pause_generation(): async with self.lock: - print("[FullyAsyncRollouter][MonitorLoop][Resume]") self.paused = False self.condition.notify_all() async def _should_pause_generation(self) -> bool: - if self.paused: - return True """Determine whether the build should be paused""" queue_stats = self.message_queue_client.get_statistics_sync() queue_size = queue_stats["queue_size"] @@ -594,15 +607,18 @@ async def pause(self): if self.active_tasks: await asyncio.gather(*self.active_tasks, return_exceptions=True) self.active_tasks.clear() - print("[FullyAsyncRollouter][Public][Pause] All active tasks completed") - self.monitor_loop_trigger = False + print("[FullyAsyncRollouter][Public][Pause] All active tasks completed") + self.monitor_loop_trigger = False async def resume(self): print("[FullyAsyncRollouter][Public][Resume]") async with self.lock: self.paused = False + self.monitor_loop_trigger = True self.condition.notify_all() - self.monitor_loop_trigger = True + + if self.config.async_training.partial_rollout: + await self.async_rollout_manager.resume_async() async def get_statistics(self) -> dict: queue_stats = self.message_queue_client.get_statistics_sync() diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py index 6bd90fe9b44..83ba95c8662 100644 --- a/verl/experimental/agent_loop/agent_loop.py +++ b/verl/experimental/agent_loop/agent_loop.py @@ -678,3 +678,8 @@ async def cancel_async(self): """Cancel all rollout tasks asynchronously.""" futures = [server.cancel.remote() for server in self.async_llm_servers] await asyncio.gather(*[asyncio.wrap_future(future.future()) for future in futures], return_exceptions=True) + + async def resume_async(self): + """Cancel all rollout tasks asynchronously.""" + futures = [server.resume.remote() for server in self.async_llm_servers] + await asyncio.gather(*[asyncio.wrap_future(future.future()) for 
future in futures], return_exceptions=True) diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py index 06e8626ad42..7ce640e33cb 100644 --- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py +++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py @@ -209,6 +209,7 @@ def __init__(self, config: DictConfig, vllm_dp_size: int, vllm_dp_rank: int, wg_ self.engine: AsyncLLM = None # for cancel + self.paused = False self.lock = asyncio.Lock() self.cancel_event: dict[str, asyncio.Event] = {} self.req_output: dict[str, Optional[RequestOutput]] = {} @@ -351,10 +352,13 @@ async def generate_for_partial( ) -> tuple[Sequence[int], bool] | tuple[str, bool]: # 设置中断标志 async with self.lock: + if self.paused: + # cancel 后, 所有任务直接返回,等待下次提交 + return [], True self.cancel_event[request_id] = asyncio.Event() cancel_handle = asyncio.create_task(self.cancel_event[request_id].wait()) + generation_handle = asyncio.create_task(self._generate_step(prompt_ids, sampling_params, request_id)) - generation_handle = asyncio.create_task(self._generate_step(prompt_ids, sampling_params, request_id)) done, pend = await asyncio.wait([generation_handle, cancel_handle], return_when=asyncio.FIRST_COMPLETED) for task in done: @@ -372,9 +376,14 @@ async def generate_for_partial( async def cancel(self): async with self.lock: + self.paused = True for request_id in self.cancel_event: self.cancel_event[request_id].set() + async def resume(self): + async with self.lock: + self.paused = False + async def wake_up(self): if self.config.rollout.free_cache_engine: await self.engine.wake_up() From ea0020569bb67cb00b6f3cec355a72d8a344f419 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Thu, 21 Aug 2025 21:04:46 +0800 Subject: [PATCH 072/182] partial rollout banchmark time --- ...sdp2_4_12.sh => dapo_7b_math_fsdp2_4_4.sh} | 6 +- .../dapo_7b_math_fsdp2_colocate.sh | 136 ++++++++++++++++++ recipe/fully_async_policy/fully_async_main.py | 3 + 
verl/trainer/main_ppo.py | 3 + 4 files changed, 145 insertions(+), 3 deletions(-) rename recipe/fully_async_policy/{dapo_7b_math_fsdp2_4_12.sh => dapo_7b_math_fsdp2_4_4.sh} (98%) create mode 100644 recipe/fully_async_policy/dapo_7b_math_fsdp2_colocate.sh diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh b/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_4.sh similarity index 98% rename from recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh rename to recipe/fully_async_policy/dapo_7b_math_fsdp2_4_4.sh index fe490af24ea..bb3eb5cc88b 100644 --- a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh +++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_4.sh @@ -2,7 +2,7 @@ set -xeuo pipefail project_name='DAPO' -exp_name='DAPO-Qwen2.5-7b-MATH-0527a1-fsdp2-one-step-off-4-12' +exp_name='DAPO-Qwen2.5-7b-MATH-0527a1-fsdp2-fully-async-4-4' # Ray # RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} @@ -77,8 +77,8 @@ gen_prompt_bsz=1 n_resp_per_prompt=16 train_prompt_mini_bsz=32 staleness_threshold=1 -total_rollout_steps=$(((512*16*100))) -trigger_parameter_sync_step=4 +total_rollout_steps=$(((512*16*10))) +trigger_parameter_sync_step=24 partial_rollout=True /home/hadoop-djst-algoplat/miniconda3/bin/python -m recipe.fully_async_policy.fully_async_main \ diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_colocate.sh b/recipe/fully_async_policy/dapo_7b_math_fsdp2_colocate.sh new file mode 100644 index 00000000000..938a6d65c32 --- /dev/null +++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_colocate.sh @@ -0,0 +1,136 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +project_name='DAPO' +exp_name='DAPO-Qwen2.5-7b-MATH-0527a1-fsdp2-colocate' + +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 * 8)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 + 
+loss_agg_mode="token-mean" + +train_prompt_bsz=512 +n_resp_per_prompt=16 +train_prompt_mini_bsz=32 + +# Ray +# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} +# WORKING_DIR=${WORKING_DIR:-"${PWD}"} +# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} +NNODES=${NNODES:-1} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} +# Paths +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface +MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"} +CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} +TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} +TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} + +MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B +CKPTS_DIR=./ckpts/${project_name}/${exp_name} +TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet +TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + +# Performance Related Parameter +use_dynamic_bsz=True +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) +offload=True +gen_tp=1 +sp_size=1 +fsdp_size=2 + +# reference run wandb: https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/runs/ow47vvon?nw=nwusertongyuxuan361 + +/home/hadoop-djst-algoplat/miniconda3/bin/python -m verl.trainer.main_ppo \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + 
data.train_batch_size=${train_prompt_bsz} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.strategy=fsdp2 \ + critic.strategy=fsdp2 \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + actor_rollout_ref.model.use_remove_padding=True \ + +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ + actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.grad_clip=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + 
actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \ + actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ + reward_model.reward_manager=dapo \ + +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger=['console','tensorboard'] \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ + trainer.nnodes="${NNODES}" \ + trainer.val_before_train=True \ + trainer.test_freq=-1 \ + trainer.save_freq=-1 \ + trainer.total_epochs=10 \ + trainer.total_training_steps=10 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.resume_mode=auto \ + trainer.log_val_generations=10 diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py index 1d4e64b1ca4..cce57501a3b 100644 --- a/recipe/fully_async_policy/fully_async_main.py +++ b/recipe/fully_async_policy/fully_async_main.py @@ -272,7 +272,10 @@ def main(config): # Ensure async training config exists if 
not hasattr(config, "async_training"): raise RuntimeError("must set async_training config") + from time import time + start_time = time() run_ppo(config, task_runner_class=FullyAsyncTaskRunner) + print(f"total time: {time() - start_time:.2f} seconds") if __name__ == "__main__": diff --git a/verl/trainer/main_ppo.py b/verl/trainer/main_ppo.py index fa12105f07f..7b34cbfaf23 100644 --- a/verl/trainer/main_ppo.py +++ b/verl/trainer/main_ppo.py @@ -37,7 +37,10 @@ def main(config): Args: config_dict: Hydra configuration dictionary containing training parameters. """ + from time import time + start_time = time() run_ppo(config) + print(f"total time: {time() - start_time:.2f} seconds") # Define a function to run the PPO-like training process From eb67390abd654bc0abf991bf1f5d623f639a1382 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Mon, 25 Aug 2025 10:37:29 +0800 Subject: [PATCH 073/182] eval code --- .../dapo_7b_math_fsdp2_4_4.sh | 5 +- .../dapo_7b_math_fsdp2_server.sh | 148 +++++++ recipe/fully_async_policy/detach_utils.py | 8 +- .../fully_async_rollouter.py | 97 +++-- .../fully_async_policy/fully_async_trainer.py | 19 +- recipe/fully_async_policy/message_queue.py | 22 ++ recipe/fully_async_policy/param_sync.py | 5 +- .../unittest/ray_async_resource_config.py | 366 ------------------ tests/special_e2e/run_fully_async_policy.sh | 16 +- verl/experimental/agent_loop/agent_loop.py | 3 +- .../agent_loop/single_turn_agent_loop.py | 6 +- .../agent_loop/tool_agent_loop.py | 7 +- 12 files changed, 275 insertions(+), 427 deletions(-) create mode 100644 recipe/fully_async_policy/dapo_7b_math_fsdp2_server.sh delete mode 100644 recipe/fully_async_policy/unittest/ray_async_resource_config.py diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_4.sh b/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_4.sh index bb3eb5cc88b..936d9475d4d 100644 --- a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_4.sh +++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_4.sh @@ -75,10 +75,11 @@ 
n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout)) train_prompt_bsz=0 gen_prompt_bsz=1 n_resp_per_prompt=16 -train_prompt_mini_bsz=32 +train_prompt_mini_bsz=64 staleness_threshold=1 total_rollout_steps=$(((512*16*10))) -trigger_parameter_sync_step=24 +test_freq=-1 +trigger_parameter_sync_step=32 partial_rollout=True /home/hadoop-djst-algoplat/miniconda3/bin/python -m recipe.fully_async_policy.fully_async_main \ diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_server.sh b/recipe/fully_async_policy/dapo_7b_math_fsdp2_server.sh new file mode 100644 index 00000000000..087dea05121 --- /dev/null +++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_server.sh @@ -0,0 +1,148 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +project_name='DAPO' +exp_name='DAPO-Qwen2.5-7b-MATH-0527a1-fsdp2-server' + + +rollout_mode="async" +rollout_name="vllm" # sglang or vllm +if [ "$rollout_mode" = "async" ]; then + export VLLM_USE_V1=1 + return_raw_chat="True" +fi + + +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 * 8)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 + +loss_agg_mode="token-mean" + +train_prompt_bsz=512 +n_resp_per_prompt=16 +train_prompt_mini_bsz=32 + +# Ray +# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} +# WORKING_DIR=${WORKING_DIR:-"${PWD}"} +# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} +NNODES=${NNODES:-1} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} +# Paths +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +# very important! 
please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface +MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"} +CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} +TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} +TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} + +MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B +CKPTS_DIR=./ckpts/${project_name}/${exp_name} +TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet +TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + +# Performance Related Parameter +use_dynamic_bsz=True +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) +offload=True +gen_tp=1 +sp_size=1 +fsdp_size=2 + +# reference run wandb: https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/runs/ow47vvon?nw=nwusertongyuxuan361 + +/home/hadoop-djst-algoplat/miniconda3/bin/python -m verl.trainer.main_ppo \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + data.return_raw_chat=${return_raw_chat} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.strategy=fsdp2 \ + critic.strategy=fsdp2 \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + 
actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + actor_rollout_ref.model.use_remove_padding=True \ + +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ + actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.grad_clip=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + 
actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.rollout.name=${rollout_name} \ + actor_rollout_ref.rollout.mode=${rollout_mode} \ + actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \ + actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ + reward_model.reward_manager=dapo \ + +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger=['console','tensorboard'] \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ + trainer.nnodes="${NNODES}" \ + trainer.val_before_train=True \ + trainer.test_freq=-1 \ + trainer.save_freq=-1 \ + trainer.total_epochs=10 \ + trainer.total_training_steps=10 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.resume_mode=auto \ + trainer.log_val_generations=10 diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py index 3ac998bc82a..b8b359f9669 100644 --- a/recipe/fully_async_policy/detach_utils.py +++ b/recipe/fully_async_policy/detach_utils.py @@ -13,7 +13,7 @@ # limitations under the License. 
import time from dataclasses import dataclass -from typing import Any +from typing import Any, Dict import numpy as np import torch @@ -49,6 +49,12 @@ class RolloutSample: param_version: int +@dataclass +class ValidateMetrics: + timing_raw: Dict[str, Any] + metrics: Dict[str, Any] + + def prepare_single_generation_data(batch_dict, global_steps) -> DataProto: """ 类似 ray_trainer._prepare_generate_batch 的逻辑,但针对单个样本 diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 0982954db75..0770c7bc6c1 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -22,11 +22,12 @@ from recipe.fully_async_policy.detach_utils import ( RolloutSample, calculate_one_step_size, - prepare_single_generation_data, + prepare_single_generation_data, ValidateMetrics, ) from recipe.fully_async_policy.message_queue import MessageQueueClient from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup from verl.trainer.ppo.ray_trainer import RayPPOTrainer, ResourcePoolManager, Role, WorkerType +from verl.utils.profiler import marked_timer from verl.utils.tracking import ValidationGenerationsLogger @@ -145,7 +146,7 @@ def __init__( f"max_required_samples: {self.max_required_samples}" ) - # 单次最多扔一次迭代需要的样本 + # 单次最多扔一次更新需要的样本 self.max_concurrent_samples = self.required_samples # 流式处理统计 @@ -167,6 +168,15 @@ def __init__( # 通过 pause 和 resume 控制 monitor_loop 中,是否进行 尝试恢复 操作 self.monitor_loop_trigger = True + self.update_param_version_time = 0 + self.global_steps = 0 + + self.progress_bar = tqdm( + total=self.total_rollout_steps / ( + self.required_samples * self.config.async_training.trigger_parameter_sync_step), + initial=self.global_steps, desc="Training Progress" + ) + async def set_message_queue_client(self, message_queue_client: MessageQueueClient): """Set message queue client""" async with self.lock: @@ -195,6 +205,17 @@ async def 
update_param_version(self, version: int): f"[FullyAsyncRollouter][Public][update_param_version] " f"Parameter version updated from {old_version} to {version}" ) + timing_raw = {} + self.update_param_version_time += 1 + if (self.val_reward_fn is not None + and self.config.trainer.test_freq > 0 + and (self.is_last_step or self.global_steps % self.config.trainer.test_freq == 0)): + with marked_timer("testing", timing_raw, color="green"): + val_metrics: dict = self._validate() + data = ValidateMetrics(timing_raw=timing_raw, metrics=val_metrics) + self.message_queue_client.put_validate(ray.cloudpickle.dumps(data)) + if version > 0: + self.progress_bar.update(1) def _validate_config(self): # Validate asynchronous training configuration @@ -245,10 +266,6 @@ async def _feed_samples(self): sample_count = 0 should_stop = False - progress_bar = tqdm( - total=self.total_rollout_steps / self.required_samples, initial=self.global_steps, desc="Training Progress" - ) - for epoch, batch_dict in continuous_iterator: if should_stop: # 检查停止标志 break @@ -275,6 +292,7 @@ async def _feed_samples(self): # 检查是否到达最后一步 if self.global_steps >= self.total_rollout_steps: + self.is_last_step = True print( f"[FullyAsyncRollouter][Feed] " f"达到最大步数,停止添加新样本 " @@ -283,15 +301,13 @@ async def _feed_samples(self): should_stop = True # 设置停止标志 break - if self.global_steps % self.required_samples == 0: - progress_bar.update(1) self.global_steps += 1 sample_count += 1 # 发送结束信号 progress_bar.close() - await self.pending_queue.put("DONE") + await self.pending_should_stopqueue.put("DONE") print(f"[FullyAsyncRollouter][Feed] 样本添加完成,总共添加了 {self.global_steps} 个步骤的样本") async def _processor_worker(self): @@ -377,12 +393,12 @@ async def _process_single_sample_streaming(self, rollout_sample: RolloutSample): rollout_sample.processing_time += processing_time rollout_sample.param_version = self.current_param_version - # print( - # f"[FullyAsyncRollouter] rollout {rollout_sample.sample_id} " - # f"cost 
{processing_time:.2f}s " - # f"len: {len(rollout_sample.agent_loop_output.response_ids)} " - # f"cancel: {agent_loop_output.is_cancel} " - # ) + print( + f"[FullyAsyncRollouter] rollout {rollout_sample.sample_id} " + f"cost {processing_time:.2f}s " + f"len: {len(rollout_sample.agent_loop_output.response_ids)} " + f"cancel: {agent_loop_output.is_cancel} " + ) if agent_loop_output.is_cancel: # 放入 cancel 队列中,等待恢复生成 @@ -411,8 +427,6 @@ async def _consumer_worker(self): else: self.dropped_stale_samples += 1 - # print(f"[FullyAsyncRollouter] submit {rollout_sample.sample_id} {'success' if success else 'error'}") - # 标记结果队列任务完成 self.result_queue.task_done() @@ -427,20 +441,19 @@ async def _streaming_generation_main(self): config=OmegaConf.to_container(self.config, resolve=True), ) - self.global_steps = 0 - # load checkpoint before doing anything self._load_checkpoint() # perform validation before training # currently, we only support validation using the reward_function. - if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True): - val_metrics = self._validate() - assert val_metrics, f"{val_metrics=}" - pprint(f"[FullyAsyncRollouter] Initial validation metrics: {val_metrics}") - self.logger.log(data=val_metrics, step=self.global_steps) - if self.config.trainer.get("val_only", False): - return + async with self.lock: + if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True): + print("Initial validation metric") + val_metrics = self._validate() + assert val_metrics, f"{val_metrics=}" + pprint(f"[FullyAsyncRollouter] Initial validation metrics: {val_metrics}") + if self.config.trainer.get("val_only", False): + return # we start from step 1 self.global_steps += 1 @@ -570,26 +583,30 @@ async def _should_pause_generation(self) -> bool: version_diff = self.current_param_version - current_trainer_version if version_diff > self.staleness_threshold: - print( - "[FullyAsyncRollouter][ShouldPause] " - f"due to 
version_diff > self.staleness_threshold: " - f"rollout_version={self.current_param_version}, " - f"trainer_version={current_trainer_version}, diff={version_diff}" - ) + if not self.paused: + print( + "[FullyAsyncRollouter][ShouldPause] " + f"due to version_diff > self.staleness_threshold: " + f"rollout_version={self.current_param_version}, " + f"trainer_version={current_trainer_version}, diff={version_diff}" + ) return True if queue_size >= self.max_queue_size: - print( - f"[FullyAsyncRollouter][ShouldPause] due to full queue: size={queue_size}, max={self.max_queue_size}" - ) + if not self.paused: + print( + f"[FullyAsyncRollouter][ShouldPause] " + f"due to full queue: size={queue_size}, max={self.max_queue_size}" + ) return True if self.staleness_samples > self.max_required_samples: - print( - "[FullyAsyncRollouter][ShouldPause] " - f"due to " - f"staleness_samples {self.staleness_samples} > max_required_samples {self.max_required_samples} " - ) + if not self.paused: + print( + "[FullyAsyncRollouter][ShouldPause] " + f"due to " + f"staleness_samples {self.staleness_samples} > max_required_samples {self.max_required_samples} " + ) return True return False diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index b82b1c4d5d2..d6d44babb2a 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -15,6 +15,7 @@ import logging import time import warnings +from datetime import datetime from pprint import pprint from typing import Any @@ -23,7 +24,7 @@ from recipe.fully_async_policy.detach_utils import ( assemble_batch_from_rollout_samples, - calculate_one_step_size, + calculate_one_step_size, ValidateMetrics, ) from recipe.fully_async_policy.message_queue import MessageQueueClient from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup @@ -156,7 +157,7 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]: 
queue_samples.append(sample) - if len(queue_samples) % 10 == 0: + if len(queue_samples) % 64 == 0: print( f"[FullyAsyncTrainer] Collected {len(queue_samples)}/{self.required_samples} samples. " f"mq_len: {queue_len}" @@ -251,6 +252,12 @@ def fit(self): metrics = {} timing_raw = {} + val_data = self.message_queue_client.get_validate_sync() + if val_data: + val_data: ValidateMetrics = ray.cloudpickle.loads(val_data) + metrics.update(val_data.metrics) + timing_raw.update(val_data.timing_raw) + with marked_timer("step", timing_raw): with marked_timer("gen", timing_raw, color="red"): epoch, batch = self._get_samples_from_queue() @@ -285,13 +292,17 @@ def fit(self): self._log_rollout(batch, reward_extra_infos_dict, timing_raw) self._check_save_checkpoint(False, timing_raw) - # self._collect_metrics(batch, epoch, metrics, timing_raw) + self._collect_metrics(batch, 0, metrics, timing_raw) pprint(metrics) # Trigger parameter synchronization after training step + + time_str = datetime.now().strftime("%H:%M:%S.%f")[:-3] + print( f"[FullyAsyncTrainer] global_steps: {self.global_steps} " f"local_trigger_step: {self.local_trigger_step} " - f"trigger_parameter_sync_step: {self.trigger_parameter_sync_step}" + f"trigger_parameter_sync_step: {self.trigger_parameter_sync_step} " + f"{time_str}" ) self._trigger_parameter_sync_after_step() self.global_steps += 1 diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py index 012445d45ed..6a425c50478 100644 --- a/recipe/fully_async_policy/message_queue.py +++ b/recipe/fully_async_policy/message_queue.py @@ -36,6 +36,9 @@ def __init__(self, config: DictConfig, max_queue_size: int = 1000): self.queue = deque(maxlen=max_queue_size) self.current_param_version = 0 + self.val_queue = deque() + + try: if hasattr(config, "async_training") and config.async_training is not None: self.staleness_threshold = getattr(config.async_training, "staleness_threshold", 3) @@ -188,6 +191,18 @@ async def 
get_memory_usage(self) -> dict: "estimated_memory_mb": total_size / (1024 * 1024), } + async def put_validate(self, data): + async with self._lock: + self.val_queue.append(data) + + async def get_validate(self): + async with self._lock: + if self.val_queue: + return self.val_queue.popleft() + else: + return None + + class MessageQueueClient: """Asyncio-compatible MessageQueue client for communicating with MessageQueue Actor""" @@ -200,6 +215,13 @@ async def put_sample(self, sample: Any, param_version: int) -> bool: future = self.queue_actor.put_sample.remote(sample, param_version) return await asyncio.wrap_future(future.future()) + async def put_validate(self, data: Any) -> bool: + future = self.queue_actor.put_validate.remote(data) + return await asyncio.wrap_future(future.future()) + + def get_validate_sync(self) -> Any | None: + return ray.get(self.queue_actor.get_validate.remote()) + async def get_sample(self) -> Any | None: """Get single sample from queue, wait until one is available (async)""" future = self.queue_actor.get_sample.remote() diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py index 53ced11956c..7e75865ebd5 100644 --- a/recipe/fully_async_policy/param_sync.py +++ b/recipe/fully_async_policy/param_sync.py @@ -86,8 +86,9 @@ def sync_weights(self, version): self.actor_wg.sync_rollout_weights() ray.get(self.rollout_wg.sync_rollout_weights()) - # Update rollout version - ray.get(self.rollouter.update_param_version.remote(version)) + # Async Update rollout version + self.rollouter.update_param_version.remote(version) + ray.get(self.rollouter.resume.remote()) end_time = time.time() diff --git a/recipe/fully_async_policy/unittest/ray_async_resource_config.py b/recipe/fully_async_policy/unittest/ray_async_resource_config.py deleted file mode 100644 index 930f8c5169f..00000000000 --- a/recipe/fully_async_policy/unittest/ray_async_resource_config.py +++ /dev/null @@ -1,366 +0,0 @@ -# Copyright 2025 Meituan Ltd. 
and/or its affiliates -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import asyncio -import random -import time - -import ray - - -# 配置1: 默认配置 -class DefaultStreamingActor: - """默认配置的流式处理Actor""" - - def __init__(self, actor_id: str): - self.actor_id = actor_id - self.processed_count = 0 - self.start_time = time.time() - self.max_concurrent_tasks = 0 - self.current_tasks = 0 - - async def process_data_async(self, data_item: dict) -> dict: - """异步处理数据""" - self.current_tasks += 1 - self.max_concurrent_tasks = max(self.max_concurrent_tasks, self.current_tasks) - - try: - task_id = data_item["id"] - processing_time = random.uniform(1, 3) - - print(f"[{self.actor_id}] 开始处理 {task_id} (当前并发: {self.current_tasks})") - - # CPU密集型任务模拟 - await asyncio.sleep(processing_time * 0.5) # I/O部分 - - # 模拟CPU计算 - total = 0 - for i in range(int(processing_time * 100000)): # CPU密集计算 - total += i * 0.001 - - await asyncio.sleep(processing_time * 0.5) # 更多I/O - - self.processed_count += 1 - - result = { - "id": task_id, - "actor_id": self.actor_id, - "processing_time": processing_time, - "processed_count": self.processed_count, - "max_concurrent": self.max_concurrent_tasks, - "compute_result": total, - "completed_at": time.time(), - } - - print(f"[{self.actor_id}] 完成处理 {task_id} (耗时: {processing_time:.1f}s)") - return result - - finally: - self.current_tasks -= 1 - - def get_stats(self) -> dict: - return { - "actor_id": self.actor_id, - "processed_count": self.processed_count, - 
"max_concurrent_tasks": self.max_concurrent_tasks, - "uptime": time.time() - self.start_time, - } - - -# 配置2: 只设置 num_cpus -@ray.remote(num_cpus=4) -class HighCpuStreamingActor(DefaultStreamingActor): - """高CPU配置的Actor""" - - pass - - -# 配置3: 只设置 max_concurrency -@ray.remote(max_concurrency=5) -class HighConcurrencyStreamingActor(DefaultStreamingActor): - """高并发配置的Actor""" - - pass - - -# 配置4: 同时设置两者 -@ray.remote(num_cpus=4, max_concurrency=8) -class OptimalStreamingActor(DefaultStreamingActor): - """最优配置的Actor""" - - pass - - -# 配置5: 极端低配置 -@ray.remote(num_cpus=1, max_concurrency=2) -class LowResourceStreamingActor(DefaultStreamingActor): - """低资源配置的Actor""" - - pass - - -class RayStreamingSystemTest: - """Ray流式处理系统测试""" - - def __init__(self): - self.test_data = [] - self.results = {} - - def generate_test_data(self, count: int = 20) -> list[dict]: - """生成测试数据""" - return [ - {"id": f"task_{i:03d}", "content": f"测试数据_{i}", "priority": random.choice(["high", "normal", "low"])} - for i in range(count) - ] - - async def test_actor_configuration(self, actor_class, config_name: str, test_data: list[dict]) -> dict: - """测试特定配置的Actor""" - print(f"\n{'=' * 60}") - print(f"测试配置: {config_name}") - print(f"{'=' * 60}") - - # 创建Actor实例 - actor = actor_class.remote(config_name) - - start_time = time.time() - - # 并发提交所有任务 - print(f"提交 {len(test_data)} 个任务...") - task_futures = [] - - for i, data_item in enumerate(test_data): - future = actor.process_data_async.remote(data_item) - task_futures.append(future) - - # 模拟流式数据到达 - if i < len(test_data) - 1: - await asyncio.sleep(0.1) # 100ms间隔 - - print("所有任务已提交,等待完成...") - - # 等待所有任务完成 - try: - results = await asyncio.gather(*[asyncio.wrap_future(future.future()) for future in task_futures]) - except Exception as e: - print(f"任务执行出错: {e}") - results = [] - - end_time = time.time() - total_time = end_time - start_time - - # 获取Actor统计信息 - stats = ray.get(actor.get_stats.remote()) - - # 计算性能指标 - performance_metrics = { - "config_name": 
config_name, - "total_tasks": len(test_data), - "completed_tasks": len(results), - "total_time": total_time, - "throughput": len(results) / total_time if total_time > 0 else 0, - "avg_processing_time": sum(r.get("processing_time", 0) for r in results) / len(results) if results else 0, - "max_concurrent_tasks": stats["max_concurrent_tasks"], - "actor_stats": stats, - "success_rate": len(results) / len(test_data) if test_data else 0, - } - - print(f"✅ 完成测试 {config_name}:") - print(f" 总任务数: {performance_metrics['total_tasks']}") - print(f" 完成任务数: {performance_metrics['completed_tasks']}") - print(f" 总耗时: {performance_metrics['total_time']:.2f}s") - print(f" 吞吐量: {performance_metrics['throughput']:.2f} tasks/s") - print(f" 最大并发: {performance_metrics['max_concurrent_tasks']}") - print(f" 成功率: {performance_metrics['success_rate'] * 100:.1f}%") - - return performance_metrics - - async def run_comprehensive_test(self): - """运行综合测试""" - print("🚀 开始Ray异步资源配置测试") - print(f"Ray集群状态: {ray.cluster_resources()}") - - # 生成测试数据 - test_data = self.generate_test_data(15) # 15个任务便于观察 - - # 测试配置列表 - test_configs = [ - (DefaultStreamingActor, "默认配置 (无特殊设置)"), - (HighCpuStreamingActor, "高CPU配置 (num_cpus=4)"), - (HighConcurrencyStreamingActor, "高并发配置 (max_concurrency=5)"), - (OptimalStreamingActor, "最优配置 (num_cpus=4, max_concurrency=8)"), - (LowResourceStreamingActor, "低资源配置 (num_cpus=1, max_concurrency=2)"), - ] - - results = {} - - # 逐个测试各种配置 - for actor_class, config_name in test_configs: - try: - result = await self.test_actor_configuration(actor_class, config_name, test_data) - results[config_name] = result - - # 测试间隔 - await asyncio.sleep(2) - - except Exception as e: - print(f"❌ 测试 {config_name} 失败: {e}") - results[config_name] = {"error": str(e)} - - # 生成对比报告 - self.generate_comparison_report(results) - - return results - - def generate_comparison_report(self, results: dict): - """生成对比报告""" - print(f"\n{'=' * 80}") - print("📊 配置对比报告") - print(f"{'=' * 80}") - - # 表头 - 
print(f"{'配置名称':<25} {'吞吐量':<12} {'最大并发':<10} {'平均处理时间':<15} {'成功率':<10}") - print("-" * 80) - - # 数据行 - best_throughput = 0 - best_config = "" - - for config_name, result in results.items(): - if "error" in result: - print(f"{config_name:<25} {'错误':<12} {'':<10} {'':<15} {'':<10}") - continue - - throughput = result.get("throughput", 0) - max_concurrent = result.get("max_concurrent_tasks", 0) - avg_time = result.get("avg_processing_time", 0) - success_rate = result.get("success_rate", 0) - - print( - f"{config_name:<25} {throughput:<12.2f} {max_concurrent:<10} " - f"{avg_time:<15.2f} {success_rate * 100:<10.1f}%" - ) - - if throughput > best_throughput: - best_throughput = throughput - best_config = config_name - - print(f"\n🏆 最佳配置: {best_config} (吞吐量: {best_throughput:.2f} tasks/s)") - - # 详细分析 - print("\n📋 配置分析:") - print("1. num_cpus 作用:") - print(" - 资源预留: 确保Actor有足够计算资源") - print(" - 节点选择: Ray选择有足够CPU的节点") - print(" - 避免资源竞争: 防止过度调度") - - print("\n2. max_concurrency 作用:") - print(" - 并发控制: 限制Actor内同时执行的任务数") - print(" - 内存保护: 防止过多并发导致内存溢出") - print(" - 性能调优: 平衡并发度和资源利用率") - - print("\n3. 
建议配置:") - print(" - CPU密集型任务: 设置较高的num_cpus,适中的max_concurrency") - print(" - I/O密集型任务: 设置较低的num_cpus,较高的max_concurrency") - print(" - 混合型任务: 平衡两个参数,根据实际测试调优") - - -async def run_resource_stress_test(): - """运行资源压力测试""" - print(f"\n{'=' * 60}") - print("🔥 资源压力测试") - print(f"{'=' * 60}") - - # 创建多个不同配置的Actor - actors = { - "高并发低CPU": OptimalStreamingActor.remote("stress_test_1"), - "低并发高CPU": ray.remote(num_cpus=8, max_concurrency=2)(DefaultStreamingActor).remote("stress_test_2"), - "平衡配置": ray.remote(num_cpus=2, max_concurrency=4)(DefaultStreamingActor).remote("stress_test_3"), - } - - # 大量并发任务 - heavy_workload = [{"id": f"heavy_{i}", "content": f"重载任务_{i}"} for i in range(50)] - - print("提交大量并发任务,观察资源使用...") - - all_futures = [] - for actor_name, actor in actors.items(): - print(f"向 {actor_name} 提交任务...") - for task in heavy_workload[:15]: # 每个Actor处理15个任务 - future = actor.process_data_async.remote(task) - all_futures.append((actor_name, future)) - - # 等待完成并记录时间 - start_time = time.time() - results = [] - - for actor_name, future in all_futures: - try: - result = await asyncio.wrap_future(future.future()) - results.append((actor_name, result)) - except Exception as e: - print(f"{actor_name} 任务失败: {e}") - - end_time = time.time() - - print(f"压力测试完成,总耗时: {end_time - start_time:.2f}s") - print(f"完成任务数: {len(results)}") - - # 按Actor分组统计 - actor_stats = {} - for actor_name, result in results: - if actor_name not in actor_stats: - actor_stats[actor_name] = [] - actor_stats[actor_name].append(result) - - for actor_name, actor_results in actor_stats.items(): - avg_time = sum(r["processing_time"] for r in actor_results) / len(actor_results) - print(f"{actor_name}: 完成 {len(actor_results)} 个任务, 平均耗时 {avg_time:.2f}s") - - -async def main(): - """主函数""" - # 初始化Ray - if not ray.is_initialized(): - ray.init( - num_cpus=16, # 设置足够的CPU资源 - object_store_memory=2000000000, # 2GB - ignore_reinit_error=True, - ) - - print("🎯 Ray异步资源配置测试") - print(f"可用资源: {ray.cluster_resources()}") - - 
try: - # 基础配置测试 - test_system = RayStreamingSystemTest() - await test_system.run_comprehensive_test() - - # 压力测试 - await run_resource_stress_test() - - print("\n所有测试完成!") - - except Exception as e: - print(f"测试执行失败: {e}") - import traceback - - traceback.print_exc() - - finally: - # 清理资源 - ray.shutdown() - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh index 8e0b82ddefc..04c17c98c24 100644 --- a/tests/special_e2e/run_fully_async_policy.sh +++ b/tests/special_e2e/run_fully_async_policy.sh @@ -49,15 +49,18 @@ top_k=-1 val_top_p=0.7 # Fully async specific parameters -n_gpus_rollout=6 +n_gpus_rollout=4 n_gpus_training=$((NUM_GPUS - n_gpus_rollout)) train_prompt_bsz=0 gen_prompt_bsz=1 -n_resp_per_prompt=3 -train_prompt_mini_bsz=32 -total_rollout_steps=50000 -staleness_threshold=10 +n_resp_per_prompt=16 +train_prompt_mini_bsz=64 +staleness_threshold=1 +total_rollout_steps=$(((512*16*10))) +test_freq=2 +trigger_parameter_sync_step=2 +partial_rollout=True exp_name="$(basename "${MODEL_ID,,}")-fully-async-policy-${ACTOR_STRATEGY}-minimal" @@ -114,7 +117,7 @@ common_params=( trainer.logger=['console'] trainer.project_name='verl-test-fully-async' trainer.experiment_name="${exp_name}" - trainer.val_before_train=False + trainer.val_before_train=True trainer.test_freq=-1 trainer.save_freq=-1 trainer.resume_mode=disable @@ -126,6 +129,7 @@ common_params=( rollout.total_epochs=2 # Fully async specific configurations async_training.staleness_threshold=${staleness_threshold} + async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" ) if [ "${ACTOR_STRATEGY}" == "fsdp2" ]; then diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py index 83ba95c8662..41dc6967b5c 100644 --- a/verl/experimental/agent_loop/agent_loop.py +++ b/verl/experimental/agent_loop/agent_loop.py @@ -444,7 +444,7 @@ async def _run_agent_loop( 
messages: list[dict[str, Any]], sampling_params: dict[str, Any], trajectory: dict[str, Any], - partial_output: Optional[AgentLoopOutput], + partial_output: Optional[AgentLoopOutput] = None, ) -> AgentLoopOutput: with rollout_trace_attr( step=trajectory["step"], @@ -456,7 +456,6 @@ async def _run_agent_loop( assert agent_name in _agent_loop_registry, ( f"Agent loop {agent_name} not registered, registered agent loops: {_agent_loop_registry.keys()}" ) - agent_loop_config = _agent_loop_registry[agent_name] agent_loop = hydra.utils.instantiate( config=agent_loop_config, diff --git a/verl/experimental/agent_loop/single_turn_agent_loop.py b/verl/experimental/agent_loop/single_turn_agent_loop.py index 411388e7321..492c1894cc5 100644 --- a/verl/experimental/agent_loop/single_turn_agent_loop.py +++ b/verl/experimental/agent_loop/single_turn_agent_loop.py @@ -13,7 +13,7 @@ # limitations under the License. import logging import os -from typing import Any +from typing import Any, Optional from uuid import uuid4 from verl.experimental.agent_loop.agent_loop import AgentLoopBase, AgentLoopOutput, register @@ -32,7 +32,9 @@ def __init__(self, *args, **kwargs): self.prompt_length = self.config.actor_rollout_ref.rollout.prompt_length self.response_length = self.config.actor_rollout_ref.rollout.response_length - async def run(self, messages: list[dict[str, Any]], sampling_params: dict[str, Any]) -> AgentLoopOutput: + async def run(self, messages: list[dict[str, Any]], + sampling_params: dict[str, Any], + output: Optional[AgentLoopOutput]) -> AgentLoopOutput: metrics = {} request_id = uuid4().hex prompt_ids = await self.loop.run_in_executor( diff --git a/verl/experimental/agent_loop/tool_agent_loop.py b/verl/experimental/agent_loop/tool_agent_loop.py index 3437c0be5ab..a0642048dc7 100644 --- a/verl/experimental/agent_loop/tool_agent_loop.py +++ b/verl/experimental/agent_loop/tool_agent_loop.py @@ -15,7 +15,7 @@ import json import logging import os -from typing import Any +from typing 
import Any, Optional from uuid import uuid4 from verl.experimental.agent_loop.agent_loop import AgentLoopBase, AgentLoopOutput, register @@ -56,7 +56,10 @@ def init_class(cls, config, tokenizer, **kwargs): cls.system_prompt = tokenizer.apply_chat_template([{}], add_generation_prompt=False, tokenize=True) @rollout_trace_op - async def run(self, messages: list[dict[str, Any]], sampling_params: dict[str, Any]) -> AgentLoopOutput: + async def run(self, + messages: list[dict[str, Any]], + sampling_params: dict[str, Any], + output: Optional[AgentLoopOutput]) -> AgentLoopOutput: metrics = {} request_id = uuid4().hex prompt_ids = await self.loop.run_in_executor( From 43883aefa98ece80a1425663bf92bb806ecd1df3 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Mon, 25 Aug 2025 11:50:14 +0800 Subject: [PATCH 074/182] fix FullyAsyncRollouter --- recipe/fully_async_policy/fully_async_rollouter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 0770c7bc6c1..71645f793df 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -307,7 +307,7 @@ async def _feed_samples(self): # 发送结束信号 progress_bar.close() - await self.pending_should_stopqueue.put("DONE") + await self.pending_queue.put("DONE") print(f"[FullyAsyncRollouter][Feed] 样本添加完成,总共添加了 {self.global_steps} 个步骤的样本") async def _processor_worker(self): From b22826586840d3922edcaccba937bbb0385ccba6 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Mon, 25 Aug 2025 19:33:22 +0800 Subject: [PATCH 075/182] group batch --- .../dapo_7b_math_fsdp2_4_4.sh | 13 ++- .../dapo_7b_math_fsdp2_colocate.sh | 13 ++- recipe/fully_async_policy/detach_utils.py | 81 ++++++------- recipe/fully_async_policy/fully_async_main.py | 1 + .../fully_async_rollouter.py | 110 ++++++++---------- .../unittest/test_batch_utils.py | 2 +- tests/special_e2e/run_fully_async_policy.sh | 20 
++-- verl/experimental/agent_loop/agent_loop.py | 46 ++++---- .../partial_single_turn_agent_loop.py | 9 +- 9 files changed, 144 insertions(+), 151 deletions(-) diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_4.sh b/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_4.sh index bb3eb5cc88b..3bcc82c9cef 100644 --- a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_4.sh +++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_4.sh @@ -65,7 +65,7 @@ gen_tp=1 sp_size=1 fsdp_size=2 -NNODES=${NNODES:-1} +NNODES=${NNODES:-2} NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} # Fully async specific parameters @@ -75,13 +75,18 @@ n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout)) train_prompt_bsz=0 gen_prompt_bsz=1 n_resp_per_prompt=16 -train_prompt_mini_bsz=32 +train_prompt_mini_bsz=64 staleness_threshold=1 total_rollout_steps=$(((512*16*10))) -trigger_parameter_sync_step=24 +trigger_parameter_sync_step=32 partial_rollout=True -/home/hadoop-djst-algoplat/miniconda3/bin/python -m recipe.fully_async_policy.fully_async_main \ +PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python" +if [ ! 
-x "$PYTHON_INTERPRETER" ]; then + PYTHON_INTERPRETER="python3" +fi + +$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \ data.train_files="${TRAIN_FILE}" \ data.val_files="${TEST_FILE}" \ data.prompt_key=prompt \ diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_colocate.sh b/recipe/fully_async_policy/dapo_7b_math_fsdp2_colocate.sh index 938a6d65c32..951db892651 100644 --- a/recipe/fully_async_policy/dapo_7b_math_fsdp2_colocate.sh +++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_colocate.sh @@ -30,7 +30,7 @@ train_prompt_mini_bsz=32 # RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} # WORKING_DIR=${WORKING_DIR:-"${PWD}"} # RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} -NNODES=${NNODES:-1} +NNODES=${NNODES:-2} NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} # Paths RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} @@ -61,7 +61,12 @@ fsdp_size=2 # reference run wandb: https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/runs/ow47vvon?nw=nwusertongyuxuan361 -/home/hadoop-djst-algoplat/miniconda3/bin/python -m verl.trainer.main_ppo \ +PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python" +if [ ! 
-x "$PYTHON_INTERPRETER" ]; then + PYTHON_INTERPRETER="python3" +fi + +$PYTHON_INTERPRETER -m verl.trainer.main_ppo \ data.train_files="${TRAIN_FILE}" \ data.val_files="${TEST_FILE}" \ data.prompt_key=prompt \ @@ -127,10 +132,10 @@ fsdp_size=2 trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ trainer.nnodes="${NNODES}" \ trainer.val_before_train=True \ - trainer.test_freq=-1 \ + trainer.test_freq=10 \ trainer.save_freq=-1 \ trainer.total_epochs=10 \ - trainer.total_training_steps=10 \ + trainer.total_training_steps=100 \ trainer.default_local_dir="${CKPTS_DIR}" \ trainer.resume_mode=auto \ trainer.log_val_generations=10 diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py index 3ac998bc82a..e3371e59acc 100644 --- a/recipe/fully_async_policy/detach_utils.py +++ b/recipe/fully_async_policy/detach_utils.py @@ -13,7 +13,7 @@ # limitations under the License. import time from dataclasses import dataclass -from typing import Any +from typing import Any, List import numpy as np import torch @@ -36,20 +36,18 @@ class RolloutSample: full_batch: Any # AgentLoopOutput from generation - agent_loop_output: Any # AgentLoopOutput + agent_loop_output_list: List[Any] # AgentLoopOutput # Metadata sample_id: str epoch: int - rollout_n_index: int # Index within the rollout.n repetitions (0, 1, ..., n-1) - original_sample_index: int # Index of the original sample before repetition # Processing metadata - processing_time: float + processing_times: List[float] param_version: int -def prepare_single_generation_data(batch_dict, global_steps) -> DataProto: +def prepare_single_generation_data(batch_dict, global_steps, rollout_n) -> DataProto: """ 类似 ray_trainer._prepare_generate_batch 的逻辑,但针对单个样本 分离出用于生成的数据和需要保留的原始数据 @@ -81,10 +79,35 @@ def prepare_single_generation_data(batch_dict, global_steps) -> DataProto: # 添加全局步数到生成数据 full_batch.meta_info["global_steps"] = global_steps - + full_batch = full_batch.repeat(repeat_times=rollout_n, interleave=True) 
return full_batch +def merge_rollout_sample(config, tokenizer, rs: RolloutSample): + # 第一步:从 AgentLoopOutput 创建生成结果的 DataProto + gen_batch_output = postprocess_agent_loop_outputs(rs.agent_loop_output_list, tokenizer, config) + + # 第二步:添加 uid + rs.full_batch.non_tensor_batch["uid"] = np.array([f"uid_{rs.sample_id}"] * len(rs.full_batch), dtype=object) + + # 第二步:合并batch + # 将 original_batch 的 non_tensor_batch 和 meta_info 合并到 final_batch + for key, value in rs.full_batch.non_tensor_batch.items(): + gen_batch_output.non_tensor_batch[key] = value + gen_batch_output.meta_info.update(rs.full_batch.meta_info) + + # 第三步,设置 full_batch + rs.full_batch = gen_batch_output + rs.processing_times = [] + for agent_loop in rs.agent_loop_output_list: + rs.processing_times.append(agent_loop.metrics.generate_sequences) + + # 第四步,清空 agent_loop_output_list + rs.agent_loop_output_list = [] + + return rs + + def assemble_batch_from_rollout_samples( rollout_samples: list[RolloutSample], tokenizer, config, balance_batch=None ) -> DataProto: @@ -111,47 +134,13 @@ def assemble_batch_from_rollout_samples( print(f"[BatchUtils] Assembling batch from {len(rollout_samples)} RolloutSample objects") - # 直接处理 RolloutSample 对象 - processing_times = [rs.processing_time for rs in rollout_samples] - - # 第一步:从 AgentLoopOutput 创建生成结果的 DataProto - agent_loop_outputs = [rs.agent_loop_output for rs in rollout_samples] - gen_batch_output = postprocess_agent_loop_outputs(agent_loop_outputs, tokenizer, config) - - # 第二步:重建原始 batch 信息 - # 每个 RolloutSample 都是独立的,直接按顺序重建原始数据 - original_batch_list = [] - for rs in rollout_samples: - item = rs.full_batch.to_items()[0] - original_batch_list.append(item) - - # print("=" * 300) - # print(original_batch_list) - - # 合并所有原始样本为一个批次 - if original_batch_list: - original_batch = DataProto.from_items(original_batch_list) - else: - # 如果没有原始数据,创建空的 DataProto - original_batch = DataProto.from_single_dict({}) - - # print("=" * 300) - # print(original_batch) + rollout_samples_batch = 
[] + processing_times = [] - # 添加 UID - uids = [] for rs in rollout_samples: - uids.append(f"uid_{rs.sample_id}") - original_batch.non_tensor_batch["uid"] = np.array(uids, dtype=object) - - # 直接合并原始数据和生成结果,不需要 repeat - # 因为队列中的每个 RolloutSample 都已经是独立的样本 - if original_batch.batch is None: - final_batch = gen_batch_output - # 将 original_batch 的 non_tensor_batch 和 meta_info 合并到 final_batch - for key, value in original_batch.non_tensor_batch.items(): - final_batch.non_tensor_batch[key] = value - final_batch.meta_info.update(original_batch.meta_info) + rollout_samples_batch.append(rs.full_batch) + processing_times.extend(rs.processing_times) + final_batch = DataProto.concat(rollout_samples_batch) # 计算 response_mask(如果不存在) if "response_mask" not in final_batch.batch.keys(): diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py index cce57501a3b..c86ade05301 100644 --- a/recipe/fully_async_policy/fully_async_main.py +++ b/recipe/fully_async_policy/fully_async_main.py @@ -273,6 +273,7 @@ def main(config): if not hasattr(config, "async_training"): raise RuntimeError("must set async_training config") from time import time + start_time = time() run_ppo(config, task_runner_class=FullyAsyncTaskRunner) print(f"total time: {time() - start_time:.2f} seconds") diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 0982954db75..e846f13c3be 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -13,7 +13,7 @@ # limitations under the License. 
import asyncio import time -from pprint import pprint +from pprint import pformat import ray from omegaconf import OmegaConf @@ -23,6 +23,7 @@ RolloutSample, calculate_one_step_size, prepare_single_generation_data, + merge_rollout_sample, ) from recipe.fully_async_policy.message_queue import MessageQueueClient from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup @@ -63,7 +64,9 @@ def __init__( assert self.config.data.train_batch_size == 0, "train_batch_size must be zero" assert self.config.data.gen_batch_size == 1, "gen_batch_size must be one" assert self.config.async_training.staleness_threshold >= 0, "staleness_threshold must larger than 0" - assert self.config.async_training.trigger_parameter_sync_step >= 1, "trigger_parameter_sync_step must larger than 1" + assert self.config.async_training.trigger_parameter_sync_step >= 1, ( + "trigger_parameter_sync_step must larger than 1" + ) self.role_worker_mapping = role_worker_mapping self.resource_pool_manager = resource_pool_manager @@ -149,7 +152,6 @@ def __init__( self.max_concurrent_samples = self.required_samples # 流式处理统计 - self.max_processing_time = 0.0 # 最长处理时间 self.processed_sample_count = 0 # 已处理的样本计数 self.active_sample_count = 0 # 当前正在处理的样本数 self.queue_full_pause_count = 0 # 队列满导致的暂停次数 @@ -243,54 +245,39 @@ def _init_async_rollout_manager(self): async def _feed_samples(self): continuous_iterator = self._create_continuous_iterator() sample_count = 0 - should_stop = False - - progress_bar = tqdm( - total=self.total_rollout_steps / self.required_samples, initial=self.global_steps, desc="Training Progress" - ) for epoch, batch_dict in continuous_iterator: - if should_stop: # 检查停止标志 - break - # 类似 _prepare_generate_batch 的逻辑:分离数据 - full_batch = prepare_single_generation_data(batch_dict, self.global_steps) - - # 根据 rollout.n 进行重复 - for rollout_n_index in range(self.config.actor_rollout_ref.rollout.n): - sample_id = f"sample_{epoch}_{sample_count}_{rollout_n_index}" - - rollout_sample = 
RolloutSample( - full_batch=full_batch, - agent_loop_output=None, # 待处理后填充 - sample_id=sample_id, - epoch=epoch, - rollout_n_index=rollout_n_index, - original_sample_index=sample_count, - processing_time=0.0, # 待处理后填充 - param_version=0, # 待处理后填充 - ) + full_batch = prepare_single_generation_data( + batch_dict, self.global_steps, self.config.actor_rollout_ref.rollout.n + ) - await self.pending_queue.put(rollout_sample) + sample_id = f"sample_{epoch}_{sample_count}" - # 检查是否到达最后一步 - if self.global_steps >= self.total_rollout_steps: - print( - f"[FullyAsyncRollouter][Feed] " - f"达到最大步数,停止添加新样本 " - f"{self.global_steps} >= {self.total_rollout_steps}" - ) - should_stop = True # 设置停止标志 - break + rollout_sample = RolloutSample( + full_batch=full_batch, + agent_loop_output_list=[None] * self.config.actor_rollout_ref.rollout.n, # 待处理后填充 + sample_id=sample_id, + epoch=epoch, + param_version=0, # 待处理后填充 + processing_times=[], + ) + + await self.pending_queue.put(rollout_sample) - if self.global_steps % self.required_samples == 0: - progress_bar.update(1) - self.global_steps += 1 + # 检查是否到达最后一步 + if self.global_steps >= self.total_rollout_steps: + print( + f"[FullyAsyncRollouter][Feed] " + f"达到最大步数,停止添加新样本 " + f"{self.global_steps} >= {self.total_rollout_steps}" + ) + break + self.global_steps += 1 sample_count += 1 # 发送结束信号 - progress_bar.close() await self.pending_queue.put("DONE") print(f"[FullyAsyncRollouter][Feed] 样本添加完成,总共添加了 {self.global_steps} 个步骤的样本") @@ -369,22 +356,29 @@ async def _processor_worker(self): async def _process_single_sample_streaming(self, rollout_sample: RolloutSample): """流式处理单个样本""" # 调用异步生成方法 - agent_loop_output, processing_time = await self.async_rollout_manager.generate_single_sample_async( - rollout_sample.full_batch, rollout_sample.agent_loop_output + agent_loop_output_list = await self.async_rollout_manager.generate_single_sample_async( + rollout_sample.full_batch, rollout_sample.agent_loop_output_list ) # 直接更新 RolloutSample 对象,填充剩余字段 - 
rollout_sample.agent_loop_output = agent_loop_output - rollout_sample.processing_time += processing_time + rollout_sample.agent_loop_output_list = agent_loop_output_list rollout_sample.param_version = self.current_param_version - # print( - # f"[FullyAsyncRollouter] rollout {rollout_sample.sample_id} " - # f"cost {processing_time:.2f}s " - # f"len: {len(rollout_sample.agent_loop_output.response_ids)} " - # f"cancel: {agent_loop_output.is_cancel} " - # ) + is_cancel = False + # 收集所有信息 + for agent_loop in agent_loop_output_list: + if is_cancel == False and agent_loop.is_cancel: + is_cancel = True - if agent_loop_output.is_cancel: + rollout_data = { + "cost": [f"{agent_loop.metrics.generate_sequences:.2f}s" for agent_loop in agent_loop_output_list], + "len": [len(agent_loop.response_ids) for agent_loop in agent_loop_output_list], + } + if is_cancel: + rollout_data["cancel"] = [agent_loop.is_cancel for agent_loop in agent_loop_output_list] + formatted_data = pformat(rollout_data, width=200, compact=True) + print(f"[FullyAsyncRollouter] rollout {rollout_sample.sample_id} {formatted_data}") + + if is_cancel: # 放入 cancel 队列中,等待恢复生成 await self.cancel_queue.put(rollout_sample) else: @@ -392,15 +386,14 @@ async def _process_single_sample_streaming(self, rollout_sample: RolloutSample): await self.result_queue.put(rollout_sample) self.processed_sample_count += 1 - # 更新最大处理时间统计 - if processing_time > self.max_processing_time: - self.max_processing_time = processing_time async def _consumer_worker(self): """消费者协程,负责从结果队列获取处理结果并放入消息队列""" while True: # 从结果队列获取 RolloutSample rollout_sample = await self.result_queue.get() + rollout_sample = merge_rollout_sample(self.config, self.tokenizer, rollout_sample) + # 直接将 RolloutSample 放入消息队列 success = await self.message_queue_client.put_sample( sample=ray.cloudpickle.dumps(rollout_sample), @@ -411,8 +404,6 @@ async def _consumer_worker(self): else: self.dropped_stale_samples += 1 - # print(f"[FullyAsyncRollouter] submit 
{rollout_sample.sample_id} {'success' if success else 'error'}") - # 标记结果队列任务完成 self.result_queue.task_done() @@ -437,7 +428,7 @@ async def _streaming_generation_main(self): if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True): val_metrics = self._validate() assert val_metrics, f"{val_metrics=}" - pprint(f"[FullyAsyncRollouter] Initial validation metrics: {val_metrics}") + print(f"[FullyAsyncRollouter] Initial validation metrics: \n {pformat(val_metrics)}") self.logger.log(data=val_metrics, step=self.global_steps) if self.config.trainer.get("val_only", False): return @@ -550,8 +541,7 @@ async def _async_monitor_loop(self): current_time = time.time() if current_time - last_stats_time >= stats_interval: stats = await self.get_statistics() - print("[FullyAsyncRollouter][MonitorLoop][Statistics]") - pprint(stats) + print(f"[FullyAsyncRollouter][MonitorLoop][Statistics] {pformat(stats)}") last_stats_time = current_time # pause 和 resume 之间,不进行恢复操作 diff --git a/recipe/fully_async_policy/unittest/test_batch_utils.py b/recipe/fully_async_policy/unittest/test_batch_utils.py index b9351c46c28..363423b589d 100644 --- a/recipe/fully_async_policy/unittest/test_batch_utils.py +++ b/recipe/fully_async_policy/unittest/test_batch_utils.py @@ -128,7 +128,7 @@ def create_mock_rollout_sample(self, sample_id: str, param_version: int = 1) -> return RolloutSample( full_batch=mock_gen_data, - agent_loop_output=agent_loop_output, + agent_loop_output_list=agent_loop_output, sample_id=sample_id, epoch=0, rollout_n_index=0, diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh index 8e0b82ddefc..ac08af80928 100644 --- a/tests/special_e2e/run_fully_async_policy.sh +++ b/tests/special_e2e/run_fully_async_policy.sh @@ -49,15 +49,17 @@ top_k=-1 val_top_p=0.7 # Fully async specific parameters -n_gpus_rollout=6 +n_gpus_rollout=4 n_gpus_training=$((NUM_GPUS - n_gpus_rollout)) train_prompt_bsz=0 gen_prompt_bsz=1 
-n_resp_per_prompt=3 -train_prompt_mini_bsz=32 -total_rollout_steps=50000 -staleness_threshold=10 +n_resp_per_prompt=16 +train_prompt_mini_bsz=4 +staleness_threshold=1 +total_rollout_steps=$(((128*2))) +trigger_parameter_sync_step=4 +partial_rollout=True exp_name="$(basename "${MODEL_ID,,}")-fully-async-policy-${ACTOR_STRATEGY}-minimal" @@ -126,14 +128,16 @@ common_params=( rollout.total_epochs=2 # Fully async specific configurations async_training.staleness_threshold=${staleness_threshold} + async_training.partial_rollout="${partial_rollout}" + async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" ) if [ "${ACTOR_STRATEGY}" == "fsdp2" ]; then echo "Running fully async training with FSDP2 strategy..." # FSDP2 specific parameters - gen_tp=2 - sp_size=2 - fsdp_size=2 + gen_tp=1 + sp_size=1 + fsdp_size=1 ref_offload=True actor_offload=False diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py index 83ba95c8662..6899937443f 100644 --- a/verl/experimental/agent_loop/agent_loop.py +++ b/verl/experimental/agent_loop/agent_loop.py @@ -18,7 +18,7 @@ import random import time from abc import ABC, abstractmethod -from typing import Any, Optional +from typing import Any, Optional, List import hydra import numpy as np @@ -383,13 +383,14 @@ async def generate_sequences(self, batch: DataProto) -> DataProto: return output async def generate_sequences_no_post( - self, batch: DataProto, partial_output: Optional[AgentLoopOutput] + self, + batch: DataProto, partial_output_list: Optional[List[AgentLoopOutput]] ) -> list[AgentLoopOutput]: """Generate sequences from agent loop. Args: batch (DataProto): Input batch. - partial_output: Optional[AgentLoopOutput]: already rollout result. + partial_output_list: Optional[List[AgentLoopOutput]]: already rollout result. Returns: list[AgentLoopOutput]: List of agent loop outputs, one per sample in the batch. 
@@ -427,8 +428,14 @@ async def generate_sequences_no_post( trajectory_info = await get_trajectory_info( batch.meta_info.get("global_steps", -1), index, batch.meta_info.get("validate", False) ) - - for agent_name, messages, trajectory in zip(agent_names, raw_prompts, trajectory_info, strict=True): + if not partial_output_list: + partial_output_list = [None] * len(batch) + + for agent_name, messages, trajectory, partial_output in zip(agent_names, + raw_prompts, + trajectory_info, + partial_output_list, + strict=True): tasks.append( asyncio.create_task( self._run_agent_loop(agent_name, messages.tolist(), sampling_params, trajectory, partial_output) @@ -602,38 +609,25 @@ def generate_sequences(self, prompts: DataProto) -> DataProto: output.meta_info = {"timing": timing} return output - async def generate_single_sample_async( - self, sample: DataProto, partial_output: Optional[AgentLoopOutput] - ) -> tuple[AgentLoopOutput, float]: + async def generate_single_sample_async(self, + sample: DataProto, + partial_output_list: Optional[List[AgentLoopOutput]], + ) -> List[AgentLoopOutput]: """ - 异步处理单个样本 - 用于流式推理的核心方法 + 异步处理单个样本, 需要复制n次 Args: sample: 单个样本数据 - partial_output: Optional[AgentLoopOutput]: already rollout result. + partial_output_list: Optional[List[AgentLoopOutput]]: already rollout result. 
Returns: tuple[AgentLoopOutput, float]: 处理结果和处理时间 """ - start_time = time.time() - # 使用负载均衡选择 worker worker = self._select_best_worker() - # 异步处理单个样本 - 使用无后处理版本获取原始AgentLoopOutput - output_future = worker.generate_sequences_no_post.remote(sample, partial_output) - outputs = await asyncio.wrap_future(output_future.future()) - - processing_time = time.time() - start_time - - # outputs 是 AgentLoopOutput 列表,取第一个(因为是单样本) - assert len(outputs) == 1, f"Expected single output for single sample, got {len(outputs)}" - output = outputs[0] - - # 添加处理时间到metrics - output.metrics.generate_sequences = processing_time - - return output, processing_time + output_future = worker.generate_sequences_no_post.remote(sample, partial_output_list) + return await asyncio.wrap_future(output_future.future()) def _select_best_worker(self): """选择最佳的 worker(简单的轮询负载均衡)""" diff --git a/verl/experimental/agent_loop/partial_single_turn_agent_loop.py b/verl/experimental/agent_loop/partial_single_turn_agent_loop.py index fd2a7292e67..899b83f1866 100644 --- a/verl/experimental/agent_loop/partial_single_turn_agent_loop.py +++ b/verl/experimental/agent_loop/partial_single_turn_agent_loop.py @@ -35,13 +35,18 @@ def __init__(self, *args, **kwargs): async def run( self, messages: list[dict[str, Any]], sampling_params: dict[str, Any], output: Optional[AgentLoopOutput] ) -> AgentLoopOutput: + if not output: prompt_ids = await self.loop.run_in_executor( None, lambda: self.tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True) ) else: - # 恢复暂停的样本,结果直接添加到 prompt_ids 后面 - prompt_ids = output.prompt_ids + output.response_ids + if output.is_cancel: + # 恢复暂停的样本,结果直接添加到 prompt_ids 后面 + prompt_ids = output.prompt_ids + output.response_ids + else: + # 同一批样本,部分cancel,部分没有cancel, 没有cancel的样本直接返回 + return output metrics = {} request_id = uuid4().hex From 7d6505432e2bf640c732a2b85b69d84ea8eee4c3 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Wed, 27 Aug 2025 17:07:35 +0800 Subject: [PATCH 076/182] 
fix oom --- recipe/fully_async_policy/dapo_7b_math_fsdp2_colocate.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_colocate.sh b/recipe/fully_async_policy/dapo_7b_math_fsdp2_colocate.sh index 951db892651..33f9836e095 100644 --- a/recipe/fully_async_policy/dapo_7b_math_fsdp2_colocate.sh +++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_colocate.sh @@ -55,8 +55,8 @@ use_dynamic_bsz=True actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) offload=True -gen_tp=1 -sp_size=1 +gen_tp=2 +sp_size=4 fsdp_size=2 # reference run wandb: https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/runs/ow47vvon?nw=nwusertongyuxuan361 From 57076bc6c509edfb4bfee2e2b28f914278a7041f Mon Sep 17 00:00:00 2001 From: wangshulin02 Date: Thu, 28 Aug 2025 09:27:19 +0800 Subject: [PATCH 077/182] fix validation bug --- recipe/fully_async_policy/fully_async_main.py | 2 ++ recipe/fully_async_policy/fully_async_rollouter.py | 12 +++++++----- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py index c86ade05301..e662aec23bf 100644 --- a/recipe/fully_async_policy/fully_async_main.py +++ b/recipe/fully_async_policy/fully_async_main.py @@ -221,6 +221,8 @@ def _create_rollouter(self, config) -> None: resource_pool_manager=create_resource_pool_manager(config, roles=[Role.Rollout]), ray_worker_group_cls=self.components["ray_worker_group_cls"], processor=self.components["processor"], + reward_fn=self.components["reward_fn"], + val_reward_fn=self.components["val_reward_fn"], device_name=config.trainer.device, ) diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 262dd622e1b..04b2fe5dc54 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ 
b/recipe/fully_async_policy/fully_async_rollouter.py @@ -13,7 +13,7 @@ # limitations under the License. import asyncio import time -from pprint import pformat +from pprint import pformat, pprint import ray from omegaconf import OmegaConf @@ -210,9 +210,11 @@ async def update_param_version(self, version: int): ) timing_raw = {} self.update_param_version_time += 1 + is_last_step = self.global_steps >= self.total_training_steps if (self.val_reward_fn is not None and self.config.trainer.test_freq > 0 - and (self.is_last_step or self.global_steps % self.config.trainer.test_freq == 0)): + and ((self.global_steps > 0 and self.global_steps % self.config.trainer.test_freq == 0) + or is_last_step)): with marked_timer("testing", timing_raw, color="green"): val_metrics: dict = self._validate() data = ValidateMetrics(timing_raw=timing_raw, metrics=val_metrics) @@ -438,12 +440,12 @@ async def _streaming_generation_main(self): config=OmegaConf.to_container(self.config, resolve=True), ) - # load checkpoint before doing anything - self._load_checkpoint() + # load checkpoint before doing anything + self._load_checkpoint() # TODO: 检查是否需要 # perform validation before training # currently, we only support validation using the reward_function. 
- async with self.lock: + async with self.lock: # TODO: 检查是否需要锁 if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True): print("Initial validation metric") val_metrics = self._validate() From a7133c94f1d084cca76c2f86f059a357d220f0da Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Thu, 28 Aug 2025 13:34:05 +0800 Subject: [PATCH 078/182] fsdp2 8 8 --- .../dapo_7b_math_fsdp2_4_4.sh | 2 +- .../dapo_7b_math_fsdp2_8_8.sh | 170 ++++++++++++++++++ recipe/fully_async_policy/runtime_env.yaml | 2 + 3 files changed, 173 insertions(+), 1 deletion(-) create mode 100644 recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh create mode 100644 recipe/fully_async_policy/runtime_env.yaml diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_4.sh b/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_4.sh index 289a3556871..5fb85a66b6f 100644 --- a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_4.sh +++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_4.sh @@ -155,7 +155,7 @@ $PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \ trainer.project_name="${project_name}" \ trainer.experiment_name="${exp_name}" \ trainer.val_before_train=True \ - trainer.test_freq=-1 \ + trainer.test_freq="${test_freq}" \ trainer.save_freq=-1 \ trainer.default_local_dir="${CKPTS_DIR}" \ trainer.resume_mode=auto \ diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh b/recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh new file mode 100644 index 00000000000..30086d3fe30 --- /dev/null +++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh @@ -0,0 +1,170 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +project_name='DAPO' +exp_name='DAPO-Qwen2.5-7b-MATH-0527a1-fsdp2-fully-async-8-8' + +# Ray +# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} +# WORKING_DIR=${WORKING_DIR:-"${PWD}"} +# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} +# Paths +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +# very important! 
please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface +MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"} +CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} +TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} +TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} + +MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B +CKPTS_DIR=./ckpts/${project_name}/${exp_name} +TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet +TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet + +rollout_mode="async" +rollout_name="vllm" # sglang or vllm +if [ "$rollout_mode" = "async" ]; then + export VLLM_USE_V1=1 + return_raw_chat="True" +fi + +# Algorithm parameters +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +# Response length parameters +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 * 8)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 + +# Training parameters +loss_agg_mode="token-mean" + +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + +# Performance Related Parameter +use_dynamic_bsz=True +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) +ref_offload=True +actor_offload=False +gen_tp=1 +sp_size=1 +fsdp_size=2 + +NNODES=${NNODES:-2} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} + +# Fully async specific parameters +n_gpus_rollout=4 +n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout)) + +train_prompt_bsz=0 +gen_prompt_bsz=1 +n_resp_per_prompt=16 +train_prompt_mini_bsz=4 
+total_rollout_steps=$(((512*100))) +test_freq=10 +staleness_threshold=1 +trigger_parameter_sync_step=16 +partial_rollout=True + +PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python" +if [ ! -x "$PYTHON_INTERPRETER" ]; then + PYTHON_INTERPRETER="python3" +fi + +$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + data.gen_batch_size=${gen_prompt_bsz} \ + data.return_raw_chat=${return_raw_chat} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.strategy=fsdp2 \ + critic.strategy=fsdp2 \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.hybrid_engine=False \ + +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ + actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + 
actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.grad_clip=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \ + actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ + actor_rollout_ref.rollout.name=${rollout_name} \ + actor_rollout_ref.rollout.mode=${rollout_mode} \ + reward_model.reward_manager=dapo \ + +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + 
trainer.logger=['console','tensorboard'] \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.val_before_train=True \ + trainer.test_freq="${test_freq}" \ + trainer.save_freq=-1 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.resume_mode=auto \ + trainer.nnodes="${NNODES}" \ + trainer.n_gpus_per_node="${n_gpus_training}" \ + rollout.nnodes="${NNODES}" \ + rollout.n_gpus_per_node="${n_gpus_rollout}" \ + rollout.total_rollout_steps="${total_rollout_steps}" \ + rollout.total_epochs=10 \ + async_training.staleness_threshold="${staleness_threshold}" \ + async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ + async_training.partial_rollout="${partial_rollout}" diff --git a/recipe/fully_async_policy/runtime_env.yaml b/recipe/fully_async_policy/runtime_env.yaml new file mode 100644 index 00000000000..81c7c9f4265 --- /dev/null +++ b/recipe/fully_async_policy/runtime_env.yaml @@ -0,0 +1,2 @@ +env_vars: + VLLM_USE_V1: "1" \ No newline at end of file From d3216d2b2c7e683c70f4b23c5ec4fa800a3bb8a2 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Thu, 28 Aug 2025 13:44:08 +0800 Subject: [PATCH 079/182] fsdp2_8_8 --- recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh b/recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh index 30086d3fe30..52ee0136d5a 100644 --- a/recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh +++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh @@ -65,12 +65,12 @@ gen_tp=1 sp_size=1 fsdp_size=2 -NNODES=${NNODES:-2} +NNODES=${NNODES:-1} NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} # Fully async specific parameters -n_gpus_rollout=4 -n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout)) +n_gpus_rollout=8 +n_gpus_training=8 train_prompt_bsz=0 gen_prompt_bsz=1 From c33e40ea5f85fd2a63947ca791e205ce5b2da256 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Thu, 28 Aug 2025 
14:31:59 +0800 Subject: [PATCH 080/182] megatron colocate --- .../dapo_7b_math_megatron_colocate.sh | 142 ++++++++++++++++++ 1 file changed, 142 insertions(+) create mode 100644 recipe/fully_async_policy/dapo_7b_math_megatron_colocate.sh diff --git a/recipe/fully_async_policy/dapo_7b_math_megatron_colocate.sh b/recipe/fully_async_policy/dapo_7b_math_megatron_colocate.sh new file mode 100644 index 00000000000..d05f5571876 --- /dev/null +++ b/recipe/fully_async_policy/dapo_7b_math_megatron_colocate.sh @@ -0,0 +1,142 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +project_name='DAPO' +exp_name='DAPO-Qwen2.5-7b-MATH-0519a1-megatron-colocate' + +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 * 8)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 + +loss_agg_mode="token-mean" + +train_prompt_bsz=512 +n_resp_per_prompt=16 +train_prompt_mini_bsz=32 + +# Ray +# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} +# WORKING_DIR=${WORKING_DIR:-"${PWD}"} +# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} +NNODES=${NNODES:-2} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} +# Paths +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +# very important! 
please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface +MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"} +CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} +TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} +TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} + +MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B +CKPTS_DIR=./ckpts/${project_name}/${exp_name} +TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet +TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet + +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + +# Performance Related Parameter +use_dynamic_bsz=True +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) +offload=True +gen_tp=2 +train_tp=2 +train_pp=2 + +# TODO: support dynamic_bsz for megatron +# actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ +# actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ +# actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ +# actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ +# actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ +# actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + +python3 -m verl.trainer.main_ppo \ + --config-path=config \ + --config-name='ppo_megatron_trainer.yaml' \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + 
data.train_batch_size=${train_prompt_bsz} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.strategy=megatron \ + critic.strategy=megatron \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.megatron.param_offload=${offload} \ + actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \ + actor_rollout_ref.actor.megatron.grad_offload=${offload} \ + actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \ + actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.optim.clip_grad=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + 
actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \ + actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \ + actor_rollout_ref.ref.megatron.param_offload=${offload} \ + reward_model.reward_manager=dapo \ + +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger=['console','tensorboard'] \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes="${NNODES}" \ + trainer.val_before_train=True \ + trainer.test_freq=10 \ + trainer.save_freq=-1 \ + trainer.total_epochs=10 \ + trainer.total_training_steps=100 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.resume_mode=auto \ + trainer.log_val_generations=10 From d391a0612722d0e9a8347f953054432866303df0 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Fri, 29 Aug 2025 17:22:55 +0800 Subject: [PATCH 081/182] rollout log probs --- .../config/fully_async_ppo_trainer.yaml | 1 + recipe/fully_async_policy/detach_utils.py | 39 +++++++++++++++++++ .../fully_async_rollouter.py | 2 +- verl/experimental/agent_loop/agent_loop.py | 2 + .../partial_single_turn_agent_loop.py | 4 +- verl/trainer/ppo/ray_trainer.py | 5 +++ .../rollout/vllm_rollout/vllm_async_server.py | 19 ++++++--- 7 files changed, 65 insertions(+), 7 deletions(-) diff --git 
a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml index 0714e107ee4..3334ee4f4d5 100644 --- a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml +++ b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml @@ -13,6 +13,7 @@ async_training: staleness_threshold: 1 # 样本新鲜度阈值 trigger_parameter_sync_step: 4 # >=1 train 每次训练一个batch, 迭代多少次后触发更新 partial_rollout: True # 同步参数时,是否中断 rollout + use_rollout_log_probs: True # Rollout配置 rollout: diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py index 0296945a2ab..986cb468932 100644 --- a/recipe/fully_async_policy/detach_utils.py +++ b/recipe/fully_async_policy/detach_utils.py @@ -87,9 +87,48 @@ def prepare_single_generation_data(batch_dict, global_steps, rollout_n) -> DataP return full_batch +def process_rollout_log_probs(data_proto: DataProto, + rollout_log_probs: list[list[float]]) -> torch.Tensor: + """ + 根据 DataProto 中的 mask 逻辑处理 rollout_log_probs + # attention_mask: [0,0,0,0,1,1,1,1, | 1,1,1,0,0,0,0,0] + + Args: + data_proto: 包含 batch 信息的 DataProto 对象 + rollout_log_probs: 二维列表,每个子列表包含一个样本的 log_probs + + Returns: + torch.Tensor: 处理后的 log_probs tensor,形状为 [bsz, response_length] + """ + + batch = data_proto.batch + response_mask = batch["response_mask"] + bsz, response_length = response_mask.shape + + # 初始化结果 tensor + rollout_log_probs_tensor = torch.zeros((bsz, response_length), dtype=torch.float32) - 1 + + + for i, log_probs_seq in enumerate(rollout_log_probs): + # 获取当前样本的有效长度(mask 中为 1 的位置数量) + valid_length = response_mask[i].sum().item() + + # 确保 log_probs_seq 的长度不超过有效长度 + actual_length = min(len(log_probs_seq), valid_length) + + # 将 log_probs 填入对应位置 + if actual_length > 0: + rollout_log_probs_tensor[i, :actual_length] = torch.tensor(log_probs_seq[:actual_length]) + + rollout_log_probs_tensor = rollout_log_probs_tensor.to(torch.float32) + return rollout_log_probs_tensor + def 
merge_rollout_sample(config, tokenizer, rs: RolloutSample): # 第一步:从 AgentLoopOutput 创建生成结果的 DataProto gen_batch_output = postprocess_agent_loop_outputs(rs.agent_loop_output_list, tokenizer, config) + rollout_log_probs = [x.log_probs for x in rs.agent_loop_output_list] + rollout_log_probs = process_rollout_log_probs(gen_batch_output, rollout_log_probs) + gen_batch_output.batch['rollout_log_probs'] = rollout_log_probs.to(torch.float32) # 第二步:添加 uid rs.full_batch.non_tensor_batch["uid"] = np.array([f"uid_{rs.sample_id}"] * len(rs.full_batch), dtype=object) diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 04b2fe5dc54..b2bcebede58 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -226,7 +226,7 @@ def _validate_config(self): # Validate asynchronous training configuration if not hasattr(self.config, "async_training"): raise ValueError("[FullyAsyncRollouter] Missing async_training configuration") - + assert self.config.actor_rollout_ref.rollout.calculate_log_probs == True, "must rollout calculate log_probs" super()._validate_config() def _create_actor_rollout_classes(self): diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py index e9383b109e5..743a4927b7e 100644 --- a/verl/experimental/agent_loop/agent_loop.py +++ b/verl/experimental/agent_loop/agent_loop.py @@ -136,6 +136,8 @@ class AgentLoopOutput(BaseModel): """Auxiliary performance metrics""" is_cancel: bool = False """Indicates whether the request was interrupted""" + log_probs: list[float] = None + """Response token log probs including LLM generated token, tool response token.""" # make hydra.utils.instantiate happy diff --git a/verl/experimental/agent_loop/partial_single_turn_agent_loop.py b/verl/experimental/agent_loop/partial_single_turn_agent_loop.py index 899b83f1866..c94788cd61d 100644 --- 
a/verl/experimental/agent_loop/partial_single_turn_agent_loop.py +++ b/verl/experimental/agent_loop/partial_single_turn_agent_loop.py @@ -51,7 +51,7 @@ async def run( metrics = {} request_id = uuid4().hex with simple_timer("generate_sequences", metrics): - response_ids, is_cancel = await self.server_manager.generate_for_partial( + response_ids, log_probs, is_cancel = await self.server_manager.generate_for_partial( request_id=request_id, prompt_ids=prompt_ids, sampling_params=sampling_params ) @@ -60,6 +60,7 @@ async def run( # 暂停待恢复样本, 把输出结果加到 response_ids 后,并重置 response_mask else: prompt_ids = output.prompt_ids + log_probs = output.log_probs + log_probs response_ids = output.response_ids + response_ids response_mask = [1] * len(response_ids) @@ -70,4 +71,5 @@ async def run( num_turns=2, metrics=metrics, is_cancel=is_cancel, + log_probs=log_probs ) diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py index 60621021b30..4aa7102977f 100644 --- a/verl/trainer/ppo/ray_trainer.py +++ b/verl/trainer/ppo/ray_trainer.py @@ -1280,6 +1280,11 @@ def _process_batch_common(self, batch, metrics, timing_raw): "training/rollout_probs_diff_std": rollout_probs_diff_std.detach().item(), } ) + if self.config.async_training and self.config.async_training.use_rollout_log_probs: + print("use_rollout_log_probs") + batch.batch["old_log_probs"] = batch.batch["rollout_log_probs"] + del actor_old_log_probs + if self.use_reference_policy: # compute reference log_prob with marked_timer("ref", timing_raw, color="olive"): diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py index 7ce640e33cb..970c309f84a 100644 --- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py +++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py @@ -337,7 +337,7 @@ async def generate(self, prompt_ids: list[int], sampling_params: dict[str, Any], async def _generate_step(self, prompt_ids: list[int], sampling_params: 
dict[str, Any], request_id: str): max_tokens = self.max_model_len - len(prompt_ids) - sampling_params = SamplingParams(max_tokens=max_tokens, **sampling_params) + sampling_params = SamplingParams(max_tokens=max_tokens, logprobs=1, **sampling_params) prompt = TokensPrompt(prompt_token_ids=prompt_ids) generator = self.engine.generate(prompt=prompt, sampling_params=sampling_params, request_id=request_id) @@ -348,13 +348,13 @@ async def _generate_step(self, prompt_ids: list[int], sampling_params: dict[str, assert self.req_output[request_id] is not None async def generate_for_partial( - self, prompt_ids: list[int], sampling_params: dict[str, Any], request_id: str - ) -> tuple[Sequence[int], bool] | tuple[str, bool]: + self, prompt_ids: list[int], sampling_params: dict[str, Any], request_id: str + ) -> tuple[list[Any], list[Any], bool] | tuple[Sequence[int], list[float], Any]: # 设置中断标志 async with self.lock: if self.paused: # cancel 后, 所有任务直接返回,等待下次提交 - return [], True + return [], [], True self.cancel_event[request_id] = asyncio.Event() cancel_handle = asyncio.create_task(self.cancel_event[request_id].wait()) generation_handle = asyncio.create_task(self._generate_step(prompt_ids, sampling_params, request_id)) @@ -368,11 +368,20 @@ async def generate_for_partial( task.cancel() async with self.lock: + print(f"token_ids size: {len(self.req_output[request_id].outputs[0].token_ids)}") + print(f"log_probs size: {len(self.req_output[request_id].outputs[0].logprobs)}") token_ids = self.req_output[request_id].outputs[0].token_ids + log_probs: list[float] = [] + for i, x in enumerate(self.req_output[request_id].outputs[0].logprobs): + # sampling_params 中 logprobs 设置为1,只返回1个 + token_id = self.req_output[request_id].outputs[0].token_ids[i] + log_probs.append(x[token_id].logprob) + is_cancel = generation_handle not in done self.cancel_event.pop(request_id, None) self.req_output.pop(request_id, None) - return token_ids, is_cancel + + return token_ids, log_probs, is_cancel async def 
cancel(self): async with self.lock: From f27b916fcea413f2faa8e8d6f93822e53144e4ab Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Fri, 29 Aug 2025 18:29:23 +0800 Subject: [PATCH 082/182] tensorboard step size refactor code fix message_queue total_train_steps int max_queue_size await self.max_steps_duration refactor print --- ...fsdp2_4_4.sh => dapo_7b_math_fsdp2_2_6.sh} | 9 +- recipe/fully_async_policy/detach_utils.py | 19 ++- recipe/fully_async_policy/fully_async_main.py | 14 +- .../fully_async_rollouter.py | 157 ++++++++---------- .../fully_async_policy/fully_async_trainer.py | 47 +++--- recipe/fully_async_policy/message_queue.py | 9 +- tests/special_e2e/run_fully_async_policy.sh | 3 +- verl/experimental/agent_loop/agent_loop.py | 24 ++- .../partial_single_turn_agent_loop.py | 3 +- .../agent_loop/single_turn_agent_loop.py | 6 +- .../agent_loop/tool_agent_loop.py | 7 +- verl/trainer/main_ppo.py | 1 + verl/trainer/ppo/ray_trainer.py | 1 - .../rollout/vllm_rollout/vllm_async_server.py | 14 +- 14 files changed, 155 insertions(+), 159 deletions(-) rename recipe/fully_async_policy/{dapo_7b_math_fsdp2_4_4.sh => dapo_7b_math_fsdp2_2_6.sh} (97%) diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_4.sh b/recipe/fully_async_policy/dapo_7b_math_fsdp2_2_6.sh similarity index 97% rename from recipe/fully_async_policy/dapo_7b_math_fsdp2_4_4.sh rename to recipe/fully_async_policy/dapo_7b_math_fsdp2_2_6.sh index 5fb85a66b6f..58017f0123b 100644 --- a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_4.sh +++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_2_6.sh @@ -69,15 +69,15 @@ NNODES=${NNODES:-1} NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} # Fully async specific parameters -n_gpus_rollout=4 +n_gpus_rollout=2 n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout)) train_prompt_bsz=0 gen_prompt_bsz=1 n_resp_per_prompt=16 -train_prompt_mini_bsz=4 -total_rollout_steps=$(((512*10))) -test_freq=-1 +train_prompt_mini_bsz=64 +total_rollout_steps=$(((512*100))) +test_freq=5 
staleness_threshold=1 trigger_parameter_sync_step=16 partial_rollout=True @@ -140,6 +140,7 @@ $PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \ actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ actor_rollout_ref.rollout.val_kwargs.do_sample=True \ actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.rollout.calculate_log_probs=True \ actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \ actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py index 986cb468932..af8dfe16857 100644 --- a/recipe/fully_async_policy/detach_utils.py +++ b/recipe/fully_async_policy/detach_utils.py @@ -13,7 +13,7 @@ # limitations under the License. import time from dataclasses import dataclass -from typing import Any, Dict, List +from typing import Any import numpy as np import torch @@ -36,20 +36,22 @@ class RolloutSample: full_batch: Any # AgentLoopOutput from generation - agent_loop_output_list: List[Any] # AgentLoopOutput + agent_loop_output_list: list[Any] # AgentLoopOutput # Metadata sample_id: str epoch: int # Processing metadata - processing_times: List[float] + processing_times: list[float] param_version: int + @dataclass class ValidateMetrics: - timing_raw: Dict[str, Any] - metrics: Dict[str, Any] + timing_raw: dict[str, Any] + metrics: dict[str, Any] + def prepare_single_generation_data(batch_dict, global_steps, rollout_n) -> DataProto: """ @@ -87,8 +89,7 @@ def prepare_single_generation_data(batch_dict, global_steps, rollout_n) -> DataP return full_batch -def process_rollout_log_probs(data_proto: DataProto, - rollout_log_probs: list[list[float]]) -> torch.Tensor: +def process_rollout_log_probs(data_proto: DataProto, rollout_log_probs: list[list[float]]) -> torch.Tensor: """ 根据 DataProto 中的 mask 逻辑处理 rollout_log_probs # attention_mask: [0,0,0,0,1,1,1,1, | 
1,1,1,0,0,0,0,0] @@ -108,7 +109,6 @@ def process_rollout_log_probs(data_proto: DataProto, # 初始化结果 tensor rollout_log_probs_tensor = torch.zeros((bsz, response_length), dtype=torch.float32) - 1 - for i, log_probs_seq in enumerate(rollout_log_probs): # 获取当前样本的有效长度(mask 中为 1 的位置数量) valid_length = response_mask[i].sum().item() @@ -123,12 +123,13 @@ def process_rollout_log_probs(data_proto: DataProto, rollout_log_probs_tensor = rollout_log_probs_tensor.to(torch.float32) return rollout_log_probs_tensor + def merge_rollout_sample(config, tokenizer, rs: RolloutSample): # 第一步:从 AgentLoopOutput 创建生成结果的 DataProto gen_batch_output = postprocess_agent_loop_outputs(rs.agent_loop_output_list, tokenizer, config) rollout_log_probs = [x.log_probs for x in rs.agent_loop_output_list] rollout_log_probs = process_rollout_log_probs(gen_batch_output, rollout_log_probs) - gen_batch_output.batch['rollout_log_probs'] = rollout_log_probs.to(torch.float32) + gen_batch_output.batch["rollout_log_probs"] = rollout_log_probs.to(torch.float32) # 第二步:添加 uid rs.full_batch.non_tensor_batch["uid"] = np.array([f"uid_{rs.sample_id}"] * len(rs.full_batch), dtype=object) diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py index e662aec23bf..2b5663bd5ea 100644 --- a/recipe/fully_async_policy/fully_async_main.py +++ b/recipe/fully_async_policy/fully_async_main.py @@ -185,8 +185,18 @@ def _initialize_components(self, config) -> None: print("[ASYNC MAIN] Creating FullyAsyncTrainer...") self._create_trainer(config) - print("[ASYNC MAIN] Creating MessageQueue...") + # 同步require samples + required_samples = ray.get(self.components["trainer"].get_required_samples.remote()) + ray.get(self.components["rollouter"].set_required_samples.remote(required_samples)) + + # 同步total_train_steps + total_train_steps = ray.get(self.components["rollouter"].get_total_train_steps.remote()) + print(f"total_train_steps {total_train_steps}") + 
ray.get(self.components["trainer"].set_total_train_steps.remote(total_train_steps)) + + # 获取 max_queue_size (使用同步方法避免异步返回值问题) max_queue_size = ray.get(self.components["rollouter"].get_max_queue_size.remote()) + print(f"[ASYNC MAIN] Creating MessageQueue... max_queue_size {max_queue_size}") message_queue = MessageQueue.remote(config, max_queue_size) message_queue_client = MessageQueueClient(message_queue) self.components["message_queue"] = message_queue @@ -204,9 +214,7 @@ def _initialize_components(self, config) -> None: rollouter=self.components["rollouter"], mq=self.components["message_queue_client"], ) - ray.get(self.components["trainer"].set_parameter_synchronizer.remote(param_synchronizer)) - ray.get(self.components["rollouter"].set_parameter_synchronizer.remote(param_synchronizer)) ray.get(param_synchronizer.sync_weights.remote(0)) diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index b2bcebede58..83bc2c0ce8a 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -17,14 +17,12 @@ import ray from omegaconf import OmegaConf -from tqdm import tqdm from recipe.fully_async_policy.detach_utils import ( RolloutSample, - calculate_one_step_size, ValidateMetrics, - prepare_single_generation_data, merge_rollout_sample, + prepare_single_generation_data, ) from recipe.fully_async_policy.message_queue import MessageQueueClient from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup @@ -102,93 +100,81 @@ def __init__( if self.config.rollout.total_rollout_steps is not None: self.total_rollout_steps = min(self.config.rollout.total_rollout_steps, self.total_rollout_steps) print(f"[FullyAsyncRollouter] Total rollout steps: {self.total_rollout_steps}") + self.total_train_steps = None + + # ==================== fully async config ==================== # Rollouter parameter configuration self.message_queue_client = None - 
self.current_param_version = 0 + # Worker groups: rollout_wg is same to actor_rollout_wg + self.rollout_wg = None + self.actor_rollout_wg = None + self.async_rollout_manager = None - # Freshness control - improved configuration management - async_config = config.async_training - self.staleness_threshold = async_config.get("staleness_threshold", 3) + # Config + self.staleness_threshold: int = config.async_training.get("staleness_threshold", 1) + self.required_samples = None + self.max_required_samples = None + # 单次最多扔一次更新需要的样本 + self.max_concurrent_samples = None + # queue size + self.max_queue_size = None # Statistics + self.current_param_version = 0 self.total_generated_samples = 0 self.staleness_samples = 0 self.dropped_stale_samples = 0 - - # Worker groups - self.rollout_wg = None - self.message_queue_client = None + self.processed_sample_count = 0 # 已处理的样本计数 + self.global_steps = 0 # Concurrency control self.paused = False self.running = True + # 通过 pause 和 resume 控制 monitor_loop 中,是否进行 尝试恢复 操作 + self.monitor_loop_trigger = True # Initialize async locks directly self.lock = asyncio.Lock() self.condition = asyncio.Condition(self.lock) - # Pause/resume statistics - self.total_pause_time = 0.0 - self.last_pause_time = None - - # Parameter synchronization related - self.param_synchronizer = None - - self.async_rollout_manager = None - - # Calculate the samples needed for a train, used to calculate staleness and interrupt rollout - self.required_samples = calculate_one_step_size( - self.minimal_bsz, config.actor_rollout_ref.actor.ppo_mini_batch_size - ) - self.max_required_samples = ( - self.required_samples * (self.staleness_threshold + 1) * config.async_training.trigger_parameter_sync_step - ) - print( - f"[FullyAsyncRollouter] required_samples : {self.required_samples} " - f"max_required_samples: {self.max_required_samples}" - ) - - # 单次最多扔一次更新需要的样本 - self.max_concurrent_samples = self.required_samples - - # 流式处理统计 - self.processed_sample_count = 0 # 已处理的样本计数 - 
self.active_sample_count = 0 # 当前正在处理的样本数 - self.queue_full_pause_count = 0 # 队列满导致的暂停次数 - - # queue size - self.max_queue_size = self.max_required_samples * 10 # x 10 avoid deadlock - print(f"[FullyAsyncRollouter] {self.max_queue_size}") - # 初始化异步队列 - self.pending_queue = asyncio.Queue(maxsize=100) + self.pending_queue = asyncio.Queue(maxsize=128) self.active_tasks = set() self.result_queue = asyncio.Queue() self.cancel_queue = asyncio.Queue() - # 通过 pause 和 resume 控制 monitor_loop 中,是否进行 尝试恢复 操作 - self.monitor_loop_trigger = True - - self.update_param_version_time = 0 - self.global_steps = 0 - - self.progress_bar = tqdm( - total=self.total_rollout_steps / ( - self.required_samples * self.config.async_training.trigger_parameter_sync_step), - initial=self.global_steps, desc="Training Progress" - ) - async def set_message_queue_client(self, message_queue_client: MessageQueueClient): """Set message queue client""" async with self.lock: self.message_queue_client = message_queue_client - async def set_parameter_synchronizer(self, param_synchronizer): - """Set parameter synchronizer""" + async def set_required_samples(self, required_samples: int): async with self.lock: - self.param_synchronizer = param_synchronizer + self.required_samples = int(required_samples) + self.max_required_samples = ( + self.required_samples + * (self.staleness_threshold + 1) + * self.config.async_training.trigger_parameter_sync_step + ) + self.total_train_steps = int( + self.total_rollout_steps + / (self.required_samples * self.config.async_training.trigger_parameter_sync_step) + ) + + # 单次最多扔一次更新需要的样本 + self.max_concurrent_samples = self.required_samples + self.max_queue_size = self.max_required_samples + + print( + f"[FullyAsyncRollouter] required_samples : {self.required_samples} " + f"max_required_samples: {self.max_required_samples} " + f"max_queue_size: {self.max_queue_size} " + f"total_train_steps: {self.total_train_steps} " + f"total_rollout_steps: {self.total_rollout_steps} " + 
f"max_concurrent_samples: {self.max_concurrent_samples} " + ) def get_rollout_wg(self): """Get rollout worker group""" @@ -197,6 +183,9 @@ def get_rollout_wg(self): def get_max_queue_size(self): return self.max_queue_size + def get_total_train_steps(self): + return self.total_train_steps + async def update_param_version(self, version: int): """Update current parameter version""" async with self.lock: @@ -209,24 +198,22 @@ async def update_param_version(self, version: int): f"Parameter version updated from {old_version} to {version}" ) timing_raw = {} - self.update_param_version_time += 1 is_last_step = self.global_steps >= self.total_training_steps - if (self.val_reward_fn is not None - and self.config.trainer.test_freq > 0 - and ((self.global_steps > 0 and self.global_steps % self.config.trainer.test_freq == 0) - or is_last_step)): + if ( + self.val_reward_fn is not None + and self.config.trainer.test_freq > 0 + and ((self.global_steps > 0 and self.global_steps % self.config.trainer.test_freq == 0) or is_last_step) + ): with marked_timer("testing", timing_raw, color="green"): val_metrics: dict = self._validate() data = ValidateMetrics(timing_raw=timing_raw, metrics=val_metrics) - self.message_queue_client.put_validate(ray.cloudpickle.dumps(data)) - if version > 0: - self.progress_bar.update(1) + await self.message_queue_client.put_validate(ray.cloudpickle.dumps(data)) def _validate_config(self): # Validate asynchronous training configuration if not hasattr(self.config, "async_training"): raise ValueError("[FullyAsyncRollouter] Missing async_training configuration") - assert self.config.actor_rollout_ref.rollout.calculate_log_probs == True, "must rollout calculate log_probs" + assert self.config.actor_rollout_ref.rollout.calculate_log_probs, "must rollout calculate log_probs" super()._validate_config() def _create_actor_rollout_classes(self): @@ -388,17 +375,17 @@ async def _process_single_sample_streaming(self, rollout_sample: RolloutSample): is_cancel = False # 
收集所有信息 for agent_loop in agent_loop_output_list: - if is_cancel == False and agent_loop.is_cancel: + if not is_cancel and agent_loop.is_cancel: is_cancel = True - rollout_data = { - "cost": [f"{agent_loop.metrics.generate_sequences:.2f}s" for agent_loop in agent_loop_output_list], - "len": [len(agent_loop.response_ids) for agent_loop in agent_loop_output_list], - } - if is_cancel: - rollout_data["cancel"] = [agent_loop.is_cancel for agent_loop in agent_loop_output_list] - formatted_data = pformat(rollout_data, width=200, compact=True) - print(f"[FullyAsyncRollouter] rollout {rollout_sample.sample_id} {formatted_data}") + # rollout_data = { + # "cost": [f"{agent_loop.metrics.generate_sequences:.2f}s" for agent_loop in agent_loop_output_list], + # "len": [len(agent_loop.response_ids) for agent_loop in agent_loop_output_list], + # } + # if is_cancel: + # rollout_data["cancel"] = [agent_loop.is_cancel for agent_loop in agent_loop_output_list] + # formatted_data = pformat(rollout_data, width=200, compact=True) + # print(f"[FullyAsyncRollouter] rollout {rollout_sample.sample_id} {formatted_data}") if is_cancel: # 放入 cancel 队列中,等待恢复生成 @@ -440,12 +427,12 @@ async def _streaming_generation_main(self): config=OmegaConf.to_container(self.config, resolve=True), ) - # load checkpoint before doing anything - self._load_checkpoint() # TODO: 检查是否需要 + # load checkpoint before doing anything + self._load_checkpoint() # TODO: 检查是否需要 # perform validation before training # currently, we only support validation using the reward_function. - async with self.lock: # TODO: 检查是否需要锁 + async with self.lock: # TODO: 检查是否需要锁 if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True): print("Initial validation metric") val_metrics = self._validate() @@ -514,8 +501,6 @@ async def fit(self): if self.message_queue_client is None: raise ValueError("MessageQueue client not set. 
Call set_message_queue_client() first.") - if self.param_synchronizer is None: - raise ValueError("param_synchronizer client not set. Call set_parameter_synchronizer() first.") # 设置运行状态 async with self.lock: @@ -550,8 +535,8 @@ async def _async_monitor_loop(self): Function 2: Trigger rollout recovery """ last_stats_time = time.time() - stats_interval = 30.0 - check_interval = 5.0 + stats_interval = 60.0 + check_interval = 10.0 while True: async with self.lock: @@ -563,6 +548,8 @@ async def _async_monitor_loop(self): if current_time - last_stats_time >= stats_interval: stats = await self.get_statistics() print(f"[FullyAsyncRollouter][MonitorLoop][Statistics] {pformat(stats)}") + data = ValidateMetrics(timing_raw={}, metrics=stats) + await self.message_queue_client.put_validate(ray.cloudpickle.dumps(data)) last_stats_time = current_time # pause 和 resume 之间,不进行恢复操作 diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index d6d44babb2a..d6f22ba312a 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -12,19 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import logging import time import warnings from datetime import datetime -from pprint import pprint from typing import Any import ray from omegaconf import OmegaConf +from tqdm import tqdm from recipe.fully_async_policy.detach_utils import ( + ValidateMetrics, assemble_batch_from_rollout_samples, - calculate_one_step_size, ValidateMetrics, ) from recipe.fully_async_policy.message_queue import MessageQueueClient from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup @@ -38,8 +37,6 @@ ) from verl.utils.debug import marked_timer -logger = logging.getLogger(__name__) - @ray.remote(num_cpus=10) class FullyAsyncTrainer(RayPPOTrainer): @@ -103,15 +100,25 @@ def __init__( self.param_synchronizer = None # Statistics + # we start from step 1 + self.global_steps = 1 + self.local_trigger_step = 1 self.processed_samples = 0 self.stale_samples_processed = 0 self.current_param_version = 0 - - self.local_trigger_step = 1 + self.total_train_steps = None + self.progress_bar = None self.trigger_parameter_sync_step = config.async_training.trigger_parameter_sync_step - self.required_samples = calculate_one_step_size( - self.minimal_bsz, config.actor_rollout_ref.actor.ppo_mini_batch_size + # calculate required_samples + ppo_mini_batch_size = config.actor_rollout_ref.actor.ppo_mini_batch_size + rollout_n = config.actor_rollout_ref.rollout.n + if ppo_mini_batch_size % rollout_n != 0: + raise ValueError( + f"PPO mini batch size ({ppo_mini_batch_size}) must be divisible by rollout n ({rollout_n})" + ) + self.required_samples = int( + self.minimal_bsz * config.actor_rollout_ref.actor.ppo_mini_batch_size / config.actor_rollout_ref.rollout.n ) def set_message_queue_client(self, message_queue_client: MessageQueueClient): @@ -122,10 +129,17 @@ def set_parameter_synchronizer(self, param_synchronizer): """Set parameter synchronizer""" self.param_synchronizer = param_synchronizer + def set_total_train_steps(self, total_train_steps): + self.total_train_steps = total_train_steps 
+ self.progress_bar = tqdm(total=self.total_train_steps, initial=0, desc="Training Progress") + def get_actor_wg(self): """Get actor worker group""" return self.actor_wg + def get_required_samples(self): + return self.required_samples + def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]: """ Get samples from message queue and compose gen_batch_output @@ -166,7 +180,7 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]: consumer_end = time.time() if not queue_samples or len(queue_samples) < self.required_samples: - logger.warning("not enough samples collected after loop") + print("[FullyAsyncTrainer] not enough samples collected after loop") return None, None print( @@ -230,22 +244,16 @@ def fit(self): from verl.utils.tracking import Tracking - self.logger = Tracking( + logger = Tracking( project_name=self.config.trainer.project_name, experiment_name=self.config.trainer.experiment_name, default_backend=self.config.trainer.logger, config=OmegaConf.to_container(self.config, resolve=True), ) - self.global_steps = 0 - # load checkpoint before doing anything self._load_checkpoint() - - # we start from step 1 - self.global_steps += 1 self.max_steps_duration = 0 - # Use queue mode, no need for traditional dataloader iterator # Initialize to get the first batch of data while True: @@ -293,11 +301,9 @@ def fit(self): self._check_save_checkpoint(False, timing_raw) self._collect_metrics(batch, 0, metrics, timing_raw) - pprint(metrics) + logger.log(data=metrics, step=self.global_steps) # Trigger parameter synchronization after training step - time_str = datetime.now().strftime("%H:%M:%S.%f")[:-3] - print( f"[FullyAsyncTrainer] global_steps: {self.global_steps} " f"local_trigger_step: {self.local_trigger_step} " @@ -316,6 +322,7 @@ def _trigger_parameter_sync_after_step(self): self.local_trigger_step = 1 self.current_param_version = self.current_param_version + 1 
ray.get(self.param_synchronizer.sync_weights.remote(self.current_param_version)) + self.progress_bar.update(1) return else: self.local_trigger_step += 1 diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py index 6a425c50478..13e1a3e21e4 100644 --- a/recipe/fully_async_policy/message_queue.py +++ b/recipe/fully_async_policy/message_queue.py @@ -32,13 +32,15 @@ class MessageQueue: def __init__(self, config: DictConfig, max_queue_size: int = 1000): self.config = config - self.max_queue_size = max_queue_size - self.queue = deque(maxlen=max_queue_size) + # 确保 max_queue_size 不为 None + if max_queue_size is None: + raise ValueError(f"max_queue_size cannot be None, got: {max_queue_size}") + self.max_queue_size = int(max_queue_size) + self.queue = deque(maxlen=self.max_queue_size) self.current_param_version = 0 self.val_queue = deque() - try: if hasattr(config, "async_training") and config.async_training is not None: self.staleness_threshold = getattr(config.async_training, "staleness_threshold", 3) @@ -203,7 +205,6 @@ async def get_validate(self): return None - class MessageQueueClient: """Asyncio-compatible MessageQueue client for communicating with MessageQueue Actor""" diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh index 1a47b0fd06e..64f9fa82825 100644 --- a/tests/special_e2e/run_fully_async_policy.sh +++ b/tests/special_e2e/run_fully_async_policy.sh @@ -55,7 +55,7 @@ n_gpus_training=$((NUM_GPUS - n_gpus_rollout)) train_prompt_bsz=0 gen_prompt_bsz=1 n_resp_per_prompt=16 -train_prompt_mini_bsz=4 +train_prompt_mini_bsz=32 total_rollout_steps=$(((128*2))) test_freq=2 staleness_threshold=1 @@ -79,6 +79,7 @@ common_params=( data.gen_batch_size=${gen_prompt_bsz} data.return_raw_chat=${return_raw_chat} actor_rollout_ref.rollout.n=${n_resp_per_prompt} + actor_rollout_ref.rollout.calculate_log_probs=True algorithm.adv_estimator=${adv_estimator} 
algorithm.use_kl_in_reward=${use_kl_in_reward} algorithm.kl_ctrl.kl_coef=${kl_coef} diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py index 743a4927b7e..8c49390f456 100644 --- a/verl/experimental/agent_loop/agent_loop.py +++ b/verl/experimental/agent_loop/agent_loop.py @@ -16,9 +16,8 @@ import logging import os import random -import time from abc import ABC, abstractmethod -from typing import Any, Optional, List +from typing import Any, Optional import hydra import numpy as np @@ -105,6 +104,7 @@ async def generate( return output async def generate_for_partial(self, request_id, prompt_ids, sampling_params): + """Generate tokens from prompt ids. with partial rollout function""" server = self._choose_server(request_id) output = await server.generate_for_partial.remote( request_id=request_id, @@ -385,8 +385,7 @@ async def generate_sequences(self, batch: DataProto) -> DataProto: return output async def generate_sequences_no_post( - self, - batch: DataProto, partial_output_list: Optional[List[AgentLoopOutput]] + self, batch: DataProto, partial_output_list: Optional[list[AgentLoopOutput]] ) -> list[AgentLoopOutput]: """Generate sequences from agent loop. 
@@ -433,11 +432,9 @@ async def generate_sequences_no_post( if not partial_output_list: partial_output_list = [None] * len(batch) - for agent_name, messages, trajectory, partial_output in zip(agent_names, - raw_prompts, - trajectory_info, - partial_output_list, - strict=True): + for agent_name, messages, trajectory, partial_output in zip( + agent_names, raw_prompts, trajectory_info, partial_output_list, strict=True + ): tasks.append( asyncio.create_task( self._run_agent_loop(agent_name, messages.tolist(), sampling_params, trajectory, partial_output) @@ -610,10 +607,11 @@ def generate_sequences(self, prompts: DataProto) -> DataProto: output.meta_info = {"timing": timing} return output - async def generate_single_sample_async(self, - sample: DataProto, - partial_output_list: Optional[List[AgentLoopOutput]], - ) -> List[AgentLoopOutput]: + async def generate_single_sample_async( + self, + sample: DataProto, + partial_output_list: Optional[list[AgentLoopOutput]], + ) -> list[AgentLoopOutput]: """ 异步处理单个样本, 需要复制n次 diff --git a/verl/experimental/agent_loop/partial_single_turn_agent_loop.py b/verl/experimental/agent_loop/partial_single_turn_agent_loop.py index c94788cd61d..df4a4f3350a 100644 --- a/verl/experimental/agent_loop/partial_single_turn_agent_loop.py +++ b/verl/experimental/agent_loop/partial_single_turn_agent_loop.py @@ -35,7 +35,6 @@ def __init__(self, *args, **kwargs): async def run( self, messages: list[dict[str, Any]], sampling_params: dict[str, Any], output: Optional[AgentLoopOutput] ) -> AgentLoopOutput: - if not output: prompt_ids = await self.loop.run_in_executor( None, lambda: self.tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True) @@ -71,5 +70,5 @@ async def run( num_turns=2, metrics=metrics, is_cancel=is_cancel, - log_probs=log_probs + log_probs=log_probs, ) diff --git a/verl/experimental/agent_loop/single_turn_agent_loop.py b/verl/experimental/agent_loop/single_turn_agent_loop.py index 492c1894cc5..df6e1991888 100644 
--- a/verl/experimental/agent_loop/single_turn_agent_loop.py +++ b/verl/experimental/agent_loop/single_turn_agent_loop.py @@ -32,9 +32,9 @@ def __init__(self, *args, **kwargs): self.prompt_length = self.config.actor_rollout_ref.rollout.prompt_length self.response_length = self.config.actor_rollout_ref.rollout.response_length - async def run(self, messages: list[dict[str, Any]], - sampling_params: dict[str, Any], - output: Optional[AgentLoopOutput]) -> AgentLoopOutput: + async def run( + self, messages: list[dict[str, Any]], sampling_params: dict[str, Any], output: Optional[AgentLoopOutput] + ) -> AgentLoopOutput: metrics = {} request_id = uuid4().hex prompt_ids = await self.loop.run_in_executor( diff --git a/verl/experimental/agent_loop/tool_agent_loop.py b/verl/experimental/agent_loop/tool_agent_loop.py index a0642048dc7..7c945b7d4c9 100644 --- a/verl/experimental/agent_loop/tool_agent_loop.py +++ b/verl/experimental/agent_loop/tool_agent_loop.py @@ -56,10 +56,9 @@ def init_class(cls, config, tokenizer, **kwargs): cls.system_prompt = tokenizer.apply_chat_template([{}], add_generation_prompt=False, tokenize=True) @rollout_trace_op - async def run(self, - messages: list[dict[str, Any]], - sampling_params: dict[str, Any], - output: Optional[AgentLoopOutput]) -> AgentLoopOutput: + async def run( + self, messages: list[dict[str, Any]], sampling_params: dict[str, Any], output: Optional[AgentLoopOutput] + ) -> AgentLoopOutput: metrics = {} request_id = uuid4().hex prompt_ids = await self.loop.run_in_executor( diff --git a/verl/trainer/main_ppo.py b/verl/trainer/main_ppo.py index 7b34cbfaf23..4b240c6ffbf 100644 --- a/verl/trainer/main_ppo.py +++ b/verl/trainer/main_ppo.py @@ -38,6 +38,7 @@ def main(config): config_dict: Hydra configuration dictionary containing training parameters. 
""" from time import time + start_time = time() run_ppo(config) print(f"total time: {time() - start_time:.2f} seconds") diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py index 4aa7102977f..e61b1dc5fe0 100644 --- a/verl/trainer/ppo/ray_trainer.py +++ b/verl/trainer/ppo/ray_trainer.py @@ -1281,7 +1281,6 @@ def _process_batch_common(self, batch, metrics, timing_raw): } ) if self.config.async_training and self.config.async_training.use_rollout_log_probs: - print("use_rollout_log_probs") batch.batch["old_log_probs"] = batch.batch["rollout_log_probs"] del actor_old_log_probs diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py index 970c309f84a..3b3e9542252 100644 --- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py +++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py @@ -15,16 +15,14 @@ import logging import os import pickle -from contextlib import ExitStack -from typing import Any, Callable, Optional, Coroutine, Sequence +from typing import Any, Callable, Optional, Sequence import ray import zmq -from omegaconf import DictConfig, ListConfig +from omegaconf import DictConfig from starlette.requests import Request from starlette.responses import JSONResponse, StreamingResponse -from vllm import SamplingParams, RequestOutput -from vllm.config import CompilationConfig, CompilationLevel +from vllm import SamplingParams from vllm.engine.arg_utils import AsyncEngineArgs from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.protocol import ChatCompletionRequest, ChatCompletionResponse, ErrorResponse @@ -348,7 +346,7 @@ async def _generate_step(self, prompt_ids: list[int], sampling_params: dict[str, assert self.req_output[request_id] is not None async def generate_for_partial( - self, prompt_ids: list[int], sampling_params: dict[str, Any], request_id: str + self, prompt_ids: list[int], sampling_params: dict[str, Any], request_id: str ) -> 
tuple[list[Any], list[Any], bool] | tuple[Sequence[int], list[float], Any]: # 设置中断标志 async with self.lock: @@ -368,19 +366,15 @@ async def generate_for_partial( task.cancel() async with self.lock: - print(f"token_ids size: {len(self.req_output[request_id].outputs[0].token_ids)}") - print(f"log_probs size: {len(self.req_output[request_id].outputs[0].logprobs)}") token_ids = self.req_output[request_id].outputs[0].token_ids log_probs: list[float] = [] for i, x in enumerate(self.req_output[request_id].outputs[0].logprobs): # sampling_params 中 logprobs 设置为1,只返回1个 token_id = self.req_output[request_id].outputs[0].token_ids[i] log_probs.append(x[token_id].logprob) - is_cancel = generation_handle not in done self.cancel_event.pop(request_id, None) self.req_output.pop(request_id, None) - return token_ids, log_probs, is_cancel async def cancel(self): From 2f8971315c6b50811f375ec6f835910143d85d90 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Sun, 31 Aug 2025 20:46:06 +0800 Subject: [PATCH 083/182] update metrics --- recipe/fully_async_policy/detach_utils.py | 6 ++++++ recipe/fully_async_policy/fully_async_rollouter.py | 5 +++-- recipe/fully_async_policy/fully_async_trainer.py | 13 +++---------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py index af8dfe16857..79df36652f7 100644 --- a/recipe/fully_async_policy/detach_utils.py +++ b/recipe/fully_async_policy/detach_utils.py @@ -45,6 +45,7 @@ class RolloutSample: # Processing metadata processing_times: list[float] param_version: int + rollout_status: dict[str, Any] @dataclass @@ -180,6 +181,9 @@ def assemble_batch_from_rollout_samples( rollout_samples_batch = [] processing_times = [] + rollout_status = rollout_samples[0].rollout_status + # 为 rollout_status 的所有 key 添加前缀 + rollout_status = {f"fully_async/{key}": value for key, value in rollout_status.items()} for rs in rollout_samples: rollout_samples_batch.append(rs.full_batch) 
@@ -208,6 +212,7 @@ def assemble_batch_from_rollout_samples( "tp99_processing_time": np.percentile(processing_times, 99), # 99百分位 "tp95_processing_time": np.percentile(processing_times, 95), # 95百分位也很有用 } + processing_time_stats = {f"fully_async/{key}": value for key, value in processing_time_stats.items()} # 创建 meta_info final_batch.meta_info.update( @@ -215,6 +220,7 @@ def assemble_batch_from_rollout_samples( "rollout_param_versions": param_versions, "param_version_diversity": len(set(param_versions)) if param_versions else 0, **processing_time_stats, + **rollout_status, } ) diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 83bc2c0ce8a..2e22dae6d7b 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -371,6 +371,7 @@ async def _process_single_sample_streaming(self, rollout_sample: RolloutSample): # 直接更新 RolloutSample 对象,填充剩余字段 rollout_sample.agent_loop_output_list = agent_loop_output_list rollout_sample.param_version = self.current_param_version + rollout_sample.rollout_status = await self.get_statistics() is_cancel = False # 收集所有信息 @@ -438,6 +439,8 @@ async def _streaming_generation_main(self): val_metrics = self._validate() assert val_metrics, f"{val_metrics=}" pprint(f"[FullyAsyncRollouter] Initial validation metrics: {val_metrics}") + data = ValidateMetrics(timing_raw={}, metrics=val_metrics) + await self.message_queue_client.put_validate(ray.cloudpickle.dumps(data)) if self.config.trainer.get("val_only", False): return @@ -548,8 +551,6 @@ async def _async_monitor_loop(self): if current_time - last_stats_time >= stats_interval: stats = await self.get_statistics() print(f"[FullyAsyncRollouter][MonitorLoop][Statistics] {pformat(stats)}") - data = ValidateMetrics(timing_raw={}, metrics=stats) - await self.message_queue_client.put_validate(ray.cloudpickle.dumps(data)) last_stats_time = current_time # pause 和 resume 
之间,不进行恢复操作 diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index d6f22ba312a..af406623145 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -285,16 +285,9 @@ def fit(self): "fully_async/current_param_version": self.current_param_version, } ) - for metric in [ - "avg_processing_time", - "max_processing_time", - "min_processing_time", - "tp50_processing_time", - "tp99_processing_time", - "tp95_processing_time", - "param_version_diversity", - ]: - metrics[f"fully_async/{metric}"] = batch.meta_info.get(metric, 0) + for key, value in batch.meta_info: + if key.startswith("fully_async"): + metrics[key] = value batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw) self._log_rollout(batch, reward_extra_infos_dict, timing_raw) From 1c3b32b822a89e96b1afca6518c6f5c1717ba1d7 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Sun, 31 Aug 2025 21:01:09 +0800 Subject: [PATCH 084/182] update metrics --- recipe/fully_async_policy/fully_async_rollouter.py | 1 + 1 file changed, 1 insertion(+) diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 2e22dae6d7b..44768cacb29 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -271,6 +271,7 @@ async def _feed_samples(self): epoch=epoch, param_version=0, # 待处理后填充 processing_times=[], + rollout_status={}, ) await self.pending_queue.put(rollout_sample) From 1bea47c16f5a220f39b8d9cff07a5e9a07684453 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Fri, 29 Aug 2025 17:22:55 +0800 Subject: [PATCH 085/182] rollout log probs tensorboard step size refactor code fix message_queue total_train_steps int max_queue_size await self.max_steps_duration refactor print update metrics update metrics update metrics --- .../config/fully_async_ppo_trainer.yaml | 1 + 
...fsdp2_4_4.sh => dapo_7b_math_fsdp2_2_6.sh} | 9 +- recipe/fully_async_policy/detach_utils.py | 56 +++++- recipe/fully_async_policy/fully_async_main.py | 14 +- .../fully_async_rollouter.py | 159 ++++++++---------- .../fully_async_policy/fully_async_trainer.py | 60 +++---- recipe/fully_async_policy/message_queue.py | 9 +- tests/special_e2e/run_fully_async_policy.sh | 3 +- verl/experimental/agent_loop/agent_loop.py | 26 +-- .../partial_single_turn_agent_loop.py | 5 +- .../agent_loop/single_turn_agent_loop.py | 6 +- .../agent_loop/tool_agent_loop.py | 7 +- verl/trainer/main_ppo.py | 1 + verl/trainer/ppo/ray_trainer.py | 4 + .../rollout/vllm_rollout/vllm_async_server.py | 21 ++- 15 files changed, 218 insertions(+), 163 deletions(-) rename recipe/fully_async_policy/{dapo_7b_math_fsdp2_4_4.sh => dapo_7b_math_fsdp2_2_6.sh} (97%) diff --git a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml index 0714e107ee4..3334ee4f4d5 100644 --- a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml +++ b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml @@ -13,6 +13,7 @@ async_training: staleness_threshold: 1 # 样本新鲜度阈值 trigger_parameter_sync_step: 4 # >=1 train 每次训练一个batch, 迭代多少次后触发更新 partial_rollout: True # 同步参数时,是否中断 rollout + use_rollout_log_probs: True # Rollout配置 rollout: diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_4.sh b/recipe/fully_async_policy/dapo_7b_math_fsdp2_2_6.sh similarity index 97% rename from recipe/fully_async_policy/dapo_7b_math_fsdp2_4_4.sh rename to recipe/fully_async_policy/dapo_7b_math_fsdp2_2_6.sh index 5fb85a66b6f..58017f0123b 100644 --- a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_4.sh +++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_2_6.sh @@ -69,15 +69,15 @@ NNODES=${NNODES:-1} NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} # Fully async specific parameters -n_gpus_rollout=4 +n_gpus_rollout=2 n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout)) 
train_prompt_bsz=0 gen_prompt_bsz=1 n_resp_per_prompt=16 -train_prompt_mini_bsz=4 -total_rollout_steps=$(((512*10))) -test_freq=-1 +train_prompt_mini_bsz=64 +total_rollout_steps=$(((512*100))) +test_freq=5 staleness_threshold=1 trigger_parameter_sync_step=16 partial_rollout=True @@ -140,6 +140,7 @@ $PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \ actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ actor_rollout_ref.rollout.val_kwargs.do_sample=True \ actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.rollout.calculate_log_probs=True \ actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \ actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py index 0296945a2ab..79df36652f7 100644 --- a/recipe/fully_async_policy/detach_utils.py +++ b/recipe/fully_async_policy/detach_utils.py @@ -13,7 +13,7 @@ # limitations under the License. 
import time from dataclasses import dataclass -from typing import Any, Dict, List +from typing import Any import numpy as np import torch @@ -36,20 +36,23 @@ class RolloutSample: full_batch: Any # AgentLoopOutput from generation - agent_loop_output_list: List[Any] # AgentLoopOutput + agent_loop_output_list: list[Any] # AgentLoopOutput # Metadata sample_id: str epoch: int # Processing metadata - processing_times: List[float] + processing_times: list[float] param_version: int + rollout_status: dict[str, Any] + @dataclass class ValidateMetrics: - timing_raw: Dict[str, Any] - metrics: Dict[str, Any] + timing_raw: dict[str, Any] + metrics: dict[str, Any] + def prepare_single_generation_data(batch_dict, global_steps, rollout_n) -> DataProto: """ @@ -87,9 +90,47 @@ def prepare_single_generation_data(batch_dict, global_steps, rollout_n) -> DataP return full_batch +def process_rollout_log_probs(data_proto: DataProto, rollout_log_probs: list[list[float]]) -> torch.Tensor: + """ + 根据 DataProto 中的 mask 逻辑处理 rollout_log_probs + # attention_mask: [0,0,0,0,1,1,1,1, | 1,1,1,0,0,0,0,0] + + Args: + data_proto: 包含 batch 信息的 DataProto 对象 + rollout_log_probs: 二维列表,每个子列表包含一个样本的 log_probs + + Returns: + torch.Tensor: 处理后的 log_probs tensor,形状为 [bsz, response_length] + """ + + batch = data_proto.batch + response_mask = batch["response_mask"] + bsz, response_length = response_mask.shape + + # 初始化结果 tensor + rollout_log_probs_tensor = torch.zeros((bsz, response_length), dtype=torch.float32) - 1 + + for i, log_probs_seq in enumerate(rollout_log_probs): + # 获取当前样本的有效长度(mask 中为 1 的位置数量) + valid_length = response_mask[i].sum().item() + + # 确保 log_probs_seq 的长度不超过有效长度 + actual_length = min(len(log_probs_seq), valid_length) + + # 将 log_probs 填入对应位置 + if actual_length > 0: + rollout_log_probs_tensor[i, :actual_length] = torch.tensor(log_probs_seq[:actual_length]) + + rollout_log_probs_tensor = rollout_log_probs_tensor.to(torch.float32) + return rollout_log_probs_tensor + + def 
merge_rollout_sample(config, tokenizer, rs: RolloutSample): # 第一步:从 AgentLoopOutput 创建生成结果的 DataProto gen_batch_output = postprocess_agent_loop_outputs(rs.agent_loop_output_list, tokenizer, config) + rollout_log_probs = [x.log_probs for x in rs.agent_loop_output_list] + rollout_log_probs = process_rollout_log_probs(gen_batch_output, rollout_log_probs) + gen_batch_output.batch["rollout_log_probs"] = rollout_log_probs.to(torch.float32) # 第二步:添加 uid rs.full_batch.non_tensor_batch["uid"] = np.array([f"uid_{rs.sample_id}"] * len(rs.full_batch), dtype=object) @@ -140,6 +181,9 @@ def assemble_batch_from_rollout_samples( rollout_samples_batch = [] processing_times = [] + rollout_status = rollout_samples[0].rollout_status + # 为 rollout_status 的所有 key 添加前缀 + rollout_status = {f"fully_async/{key}": value for key, value in rollout_status.items()} for rs in rollout_samples: rollout_samples_batch.append(rs.full_batch) @@ -168,6 +212,7 @@ def assemble_batch_from_rollout_samples( "tp99_processing_time": np.percentile(processing_times, 99), # 99百分位 "tp95_processing_time": np.percentile(processing_times, 95), # 95百分位也很有用 } + processing_time_stats = {f"fully_async/{key}": value for key, value in processing_time_stats.items()} # 创建 meta_info final_batch.meta_info.update( @@ -175,6 +220,7 @@ def assemble_batch_from_rollout_samples( "rollout_param_versions": param_versions, "param_version_diversity": len(set(param_versions)) if param_versions else 0, **processing_time_stats, + **rollout_status, } ) diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py index e662aec23bf..2b5663bd5ea 100644 --- a/recipe/fully_async_policy/fully_async_main.py +++ b/recipe/fully_async_policy/fully_async_main.py @@ -185,8 +185,18 @@ def _initialize_components(self, config) -> None: print("[ASYNC MAIN] Creating FullyAsyncTrainer...") self._create_trainer(config) - print("[ASYNC MAIN] Creating MessageQueue...") + # 同步require samples + required_samples = 
ray.get(self.components["trainer"].get_required_samples.remote()) + ray.get(self.components["rollouter"].set_required_samples.remote(required_samples)) + + # 同步total_train_steps + total_train_steps = ray.get(self.components["rollouter"].get_total_train_steps.remote()) + print(f"total_train_steps {total_train_steps}") + ray.get(self.components["trainer"].set_total_train_steps.remote(total_train_steps)) + + # 获取 max_queue_size (使用同步方法避免异步返回值问题) max_queue_size = ray.get(self.components["rollouter"].get_max_queue_size.remote()) + print(f"[ASYNC MAIN] Creating MessageQueue... max_queue_size {max_queue_size}") message_queue = MessageQueue.remote(config, max_queue_size) message_queue_client = MessageQueueClient(message_queue) self.components["message_queue"] = message_queue @@ -204,9 +214,7 @@ def _initialize_components(self, config) -> None: rollouter=self.components["rollouter"], mq=self.components["message_queue_client"], ) - ray.get(self.components["trainer"].set_parameter_synchronizer.remote(param_synchronizer)) - ray.get(self.components["rollouter"].set_parameter_synchronizer.remote(param_synchronizer)) ray.get(param_synchronizer.sync_weights.remote(0)) diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 04b2fe5dc54..44768cacb29 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -17,14 +17,12 @@ import ray from omegaconf import OmegaConf -from tqdm import tqdm from recipe.fully_async_policy.detach_utils import ( RolloutSample, - calculate_one_step_size, ValidateMetrics, - prepare_single_generation_data, merge_rollout_sample, + prepare_single_generation_data, ) from recipe.fully_async_policy.message_queue import MessageQueueClient from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup @@ -102,93 +100,81 @@ def __init__( if self.config.rollout.total_rollout_steps is not None: self.total_rollout_steps = 
min(self.config.rollout.total_rollout_steps, self.total_rollout_steps) print(f"[FullyAsyncRollouter] Total rollout steps: {self.total_rollout_steps}") + self.total_train_steps = None + + # ==================== fully async config ==================== # Rollouter parameter configuration self.message_queue_client = None - self.current_param_version = 0 + # Worker groups: rollout_wg is same to actor_rollout_wg + self.rollout_wg = None + self.actor_rollout_wg = None + self.async_rollout_manager = None - # Freshness control - improved configuration management - async_config = config.async_training - self.staleness_threshold = async_config.get("staleness_threshold", 3) + # Config + self.staleness_threshold: int = config.async_training.get("staleness_threshold", 1) + self.required_samples = None + self.max_required_samples = None + # 单次最多扔一次更新需要的样本 + self.max_concurrent_samples = None + # queue size + self.max_queue_size = None # Statistics + self.current_param_version = 0 self.total_generated_samples = 0 self.staleness_samples = 0 self.dropped_stale_samples = 0 - - # Worker groups - self.rollout_wg = None - self.message_queue_client = None + self.processed_sample_count = 0 # 已处理的样本计数 + self.global_steps = 0 # Concurrency control self.paused = False self.running = True + # 通过 pause 和 resume 控制 monitor_loop 中,是否进行 尝试恢复 操作 + self.monitor_loop_trigger = True # Initialize async locks directly self.lock = asyncio.Lock() self.condition = asyncio.Condition(self.lock) - # Pause/resume statistics - self.total_pause_time = 0.0 - self.last_pause_time = None - - # Parameter synchronization related - self.param_synchronizer = None - - self.async_rollout_manager = None - - # Calculate the samples needed for a train, used to calculate staleness and interrupt rollout - self.required_samples = calculate_one_step_size( - self.minimal_bsz, config.actor_rollout_ref.actor.ppo_mini_batch_size - ) - self.max_required_samples = ( - self.required_samples * (self.staleness_threshold + 1) * 
config.async_training.trigger_parameter_sync_step - ) - print( - f"[FullyAsyncRollouter] required_samples : {self.required_samples} " - f"max_required_samples: {self.max_required_samples}" - ) - - # 单次最多扔一次更新需要的样本 - self.max_concurrent_samples = self.required_samples - - # 流式处理统计 - self.processed_sample_count = 0 # 已处理的样本计数 - self.active_sample_count = 0 # 当前正在处理的样本数 - self.queue_full_pause_count = 0 # 队列满导致的暂停次数 - - # queue size - self.max_queue_size = self.max_required_samples * 10 # x 10 avoid deadlock - print(f"[FullyAsyncRollouter] {self.max_queue_size}") - # 初始化异步队列 - self.pending_queue = asyncio.Queue(maxsize=100) + self.pending_queue = asyncio.Queue(maxsize=128) self.active_tasks = set() self.result_queue = asyncio.Queue() self.cancel_queue = asyncio.Queue() - # 通过 pause 和 resume 控制 monitor_loop 中,是否进行 尝试恢复 操作 - self.monitor_loop_trigger = True - - self.update_param_version_time = 0 - self.global_steps = 0 - - self.progress_bar = tqdm( - total=self.total_rollout_steps / ( - self.required_samples * self.config.async_training.trigger_parameter_sync_step), - initial=self.global_steps, desc="Training Progress" - ) - async def set_message_queue_client(self, message_queue_client: MessageQueueClient): """Set message queue client""" async with self.lock: self.message_queue_client = message_queue_client - async def set_parameter_synchronizer(self, param_synchronizer): - """Set parameter synchronizer""" + async def set_required_samples(self, required_samples: int): async with self.lock: - self.param_synchronizer = param_synchronizer + self.required_samples = int(required_samples) + self.max_required_samples = ( + self.required_samples + * (self.staleness_threshold + 1) + * self.config.async_training.trigger_parameter_sync_step + ) + self.total_train_steps = int( + self.total_rollout_steps + / (self.required_samples * self.config.async_training.trigger_parameter_sync_step) + ) + + # 单次最多扔一次更新需要的样本 + self.max_concurrent_samples = self.required_samples + 
self.max_queue_size = self.max_required_samples + + print( + f"[FullyAsyncRollouter] required_samples : {self.required_samples} " + f"max_required_samples: {self.max_required_samples} " + f"max_queue_size: {self.max_queue_size} " + f"total_train_steps: {self.total_train_steps} " + f"total_rollout_steps: {self.total_rollout_steps} " + f"max_concurrent_samples: {self.max_concurrent_samples} " + ) def get_rollout_wg(self): """Get rollout worker group""" @@ -197,6 +183,9 @@ def get_rollout_wg(self): def get_max_queue_size(self): return self.max_queue_size + def get_total_train_steps(self): + return self.total_train_steps + async def update_param_version(self, version: int): """Update current parameter version""" async with self.lock: @@ -209,24 +198,22 @@ async def update_param_version(self, version: int): f"Parameter version updated from {old_version} to {version}" ) timing_raw = {} - self.update_param_version_time += 1 is_last_step = self.global_steps >= self.total_training_steps - if (self.val_reward_fn is not None - and self.config.trainer.test_freq > 0 - and ((self.global_steps > 0 and self.global_steps % self.config.trainer.test_freq == 0) - or is_last_step)): + if ( + self.val_reward_fn is not None + and self.config.trainer.test_freq > 0 + and ((self.global_steps > 0 and self.global_steps % self.config.trainer.test_freq == 0) or is_last_step) + ): with marked_timer("testing", timing_raw, color="green"): val_metrics: dict = self._validate() data = ValidateMetrics(timing_raw=timing_raw, metrics=val_metrics) - self.message_queue_client.put_validate(ray.cloudpickle.dumps(data)) - if version > 0: - self.progress_bar.update(1) + await self.message_queue_client.put_validate(ray.cloudpickle.dumps(data)) def _validate_config(self): # Validate asynchronous training configuration if not hasattr(self.config, "async_training"): raise ValueError("[FullyAsyncRollouter] Missing async_training configuration") - + assert self.config.actor_rollout_ref.rollout.calculate_log_probs, 
"must rollout calculate log_probs" super()._validate_config() def _create_actor_rollout_classes(self): @@ -284,6 +271,7 @@ async def _feed_samples(self): epoch=epoch, param_version=0, # 待处理后填充 processing_times=[], + rollout_status={}, ) await self.pending_queue.put(rollout_sample) @@ -384,21 +372,22 @@ async def _process_single_sample_streaming(self, rollout_sample: RolloutSample): # 直接更新 RolloutSample 对象,填充剩余字段 rollout_sample.agent_loop_output_list = agent_loop_output_list rollout_sample.param_version = self.current_param_version + rollout_sample.rollout_status = await self.get_statistics() is_cancel = False # 收集所有信息 for agent_loop in agent_loop_output_list: - if is_cancel == False and agent_loop.is_cancel: + if not is_cancel and agent_loop.is_cancel: is_cancel = True - rollout_data = { - "cost": [f"{agent_loop.metrics.generate_sequences:.2f}s" for agent_loop in agent_loop_output_list], - "len": [len(agent_loop.response_ids) for agent_loop in agent_loop_output_list], - } - if is_cancel: - rollout_data["cancel"] = [agent_loop.is_cancel for agent_loop in agent_loop_output_list] - formatted_data = pformat(rollout_data, width=200, compact=True) - print(f"[FullyAsyncRollouter] rollout {rollout_sample.sample_id} {formatted_data}") + # rollout_data = { + # "cost": [f"{agent_loop.metrics.generate_sequences:.2f}s" for agent_loop in agent_loop_output_list], + # "len": [len(agent_loop.response_ids) for agent_loop in agent_loop_output_list], + # } + # if is_cancel: + # rollout_data["cancel"] = [agent_loop.is_cancel for agent_loop in agent_loop_output_list] + # formatted_data = pformat(rollout_data, width=200, compact=True) + # print(f"[FullyAsyncRollouter] rollout {rollout_sample.sample_id} {formatted_data}") if is_cancel: # 放入 cancel 队列中,等待恢复生成 @@ -440,17 +429,19 @@ async def _streaming_generation_main(self): config=OmegaConf.to_container(self.config, resolve=True), ) - # load checkpoint before doing anything - self._load_checkpoint() # TODO: 检查是否需要 + # load checkpoint 
before doing anything + self._load_checkpoint() # TODO: 检查是否需要 # perform validation before training # currently, we only support validation using the reward_function. - async with self.lock: # TODO: 检查是否需要锁 + async with self.lock: # TODO: 检查是否需要锁 if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True): print("Initial validation metric") val_metrics = self._validate() assert val_metrics, f"{val_metrics=}" pprint(f"[FullyAsyncRollouter] Initial validation metrics: {val_metrics}") + data = ValidateMetrics(timing_raw={}, metrics=val_metrics) + await self.message_queue_client.put_validate(ray.cloudpickle.dumps(data)) if self.config.trainer.get("val_only", False): return @@ -514,8 +505,6 @@ async def fit(self): if self.message_queue_client is None: raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.") - if self.param_synchronizer is None: - raise ValueError("param_synchronizer client not set. Call set_parameter_synchronizer() first.") # 设置运行状态 async with self.lock: @@ -550,8 +539,8 @@ async def _async_monitor_loop(self): Function 2: Trigger rollout recovery """ last_stats_time = time.time() - stats_interval = 30.0 - check_interval = 5.0 + stats_interval = 60.0 + check_interval = 10.0 while True: async with self.lock: diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index d6d44babb2a..6d74ee215f8 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -12,19 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import logging import time import warnings from datetime import datetime -from pprint import pprint from typing import Any import ray from omegaconf import OmegaConf +from tqdm import tqdm from recipe.fully_async_policy.detach_utils import ( + ValidateMetrics, assemble_batch_from_rollout_samples, - calculate_one_step_size, ValidateMetrics, ) from recipe.fully_async_policy.message_queue import MessageQueueClient from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup @@ -38,8 +37,6 @@ ) from verl.utils.debug import marked_timer -logger = logging.getLogger(__name__) - @ray.remote(num_cpus=10) class FullyAsyncTrainer(RayPPOTrainer): @@ -103,15 +100,25 @@ def __init__( self.param_synchronizer = None # Statistics + # we start from step 1 + self.global_steps = 1 + self.local_trigger_step = 1 self.processed_samples = 0 self.stale_samples_processed = 0 self.current_param_version = 0 - - self.local_trigger_step = 1 + self.total_train_steps = None + self.progress_bar = None self.trigger_parameter_sync_step = config.async_training.trigger_parameter_sync_step - self.required_samples = calculate_one_step_size( - self.minimal_bsz, config.actor_rollout_ref.actor.ppo_mini_batch_size + # calculate required_samples + ppo_mini_batch_size = config.actor_rollout_ref.actor.ppo_mini_batch_size + rollout_n = config.actor_rollout_ref.rollout.n + if ppo_mini_batch_size % rollout_n != 0: + raise ValueError( + f"PPO mini batch size ({ppo_mini_batch_size}) must be divisible by rollout n ({rollout_n})" + ) + self.required_samples = int( + self.minimal_bsz * config.actor_rollout_ref.actor.ppo_mini_batch_size / config.actor_rollout_ref.rollout.n ) def set_message_queue_client(self, message_queue_client: MessageQueueClient): @@ -122,10 +129,17 @@ def set_parameter_synchronizer(self, param_synchronizer): """Set parameter synchronizer""" self.param_synchronizer = param_synchronizer + def set_total_train_steps(self, total_train_steps): + self.total_train_steps = total_train_steps 
+ self.progress_bar = tqdm(total=self.total_train_steps, initial=0, desc="Training Progress") + def get_actor_wg(self): """Get actor worker group""" return self.actor_wg + def get_required_samples(self): + return self.required_samples + def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]: """ Get samples from message queue and compose gen_batch_output @@ -166,7 +180,7 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]: consumer_end = time.time() if not queue_samples or len(queue_samples) < self.required_samples: - logger.warning("not enough samples collected after loop") + print("[FullyAsyncTrainer] not enough samples collected after loop") return None, None print( @@ -230,22 +244,16 @@ def fit(self): from verl.utils.tracking import Tracking - self.logger = Tracking( + logger = Tracking( project_name=self.config.trainer.project_name, experiment_name=self.config.trainer.experiment_name, default_backend=self.config.trainer.logger, config=OmegaConf.to_container(self.config, resolve=True), ) - self.global_steps = 0 - # load checkpoint before doing anything self._load_checkpoint() - - # we start from step 1 - self.global_steps += 1 self.max_steps_duration = 0 - # Use queue mode, no need for traditional dataloader iterator # Initialize to get the first batch of data while True: @@ -277,27 +285,18 @@ def fit(self): "fully_async/current_param_version": self.current_param_version, } ) - for metric in [ - "avg_processing_time", - "max_processing_time", - "min_processing_time", - "tp50_processing_time", - "tp99_processing_time", - "tp95_processing_time", - "param_version_diversity", - ]: - metrics[f"fully_async/{metric}"] = batch.meta_info.get(metric, 0) + for key, value in batch.meta_info.items(): + if key.startswith("fully_async"): + metrics[key] = value batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw) self._log_rollout(batch, reward_extra_infos_dict, timing_raw) 
self._check_save_checkpoint(False, timing_raw) self._collect_metrics(batch, 0, metrics, timing_raw) - pprint(metrics) + logger.log(data=metrics, step=self.global_steps) # Trigger parameter synchronization after training step - time_str = datetime.now().strftime("%H:%M:%S.%f")[:-3] - print( f"[FullyAsyncTrainer] global_steps: {self.global_steps} " f"local_trigger_step: {self.local_trigger_step} " @@ -316,6 +315,7 @@ def _trigger_parameter_sync_after_step(self): self.local_trigger_step = 1 self.current_param_version = self.current_param_version + 1 ray.get(self.param_synchronizer.sync_weights.remote(self.current_param_version)) + self.progress_bar.update(1) return else: self.local_trigger_step += 1 diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py index 6a425c50478..13e1a3e21e4 100644 --- a/recipe/fully_async_policy/message_queue.py +++ b/recipe/fully_async_policy/message_queue.py @@ -32,13 +32,15 @@ class MessageQueue: def __init__(self, config: DictConfig, max_queue_size: int = 1000): self.config = config - self.max_queue_size = max_queue_size - self.queue = deque(maxlen=max_queue_size) + # 确保 max_queue_size 不为 None + if max_queue_size is None: + raise ValueError(f"max_queue_size cannot be None, got: {max_queue_size}") + self.max_queue_size = int(max_queue_size) + self.queue = deque(maxlen=self.max_queue_size) self.current_param_version = 0 self.val_queue = deque() - try: if hasattr(config, "async_training") and config.async_training is not None: self.staleness_threshold = getattr(config.async_training, "staleness_threshold", 3) @@ -203,7 +205,6 @@ async def get_validate(self): return None - class MessageQueueClient: """Asyncio-compatible MessageQueue client for communicating with MessageQueue Actor""" diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh index 1a47b0fd06e..64f9fa82825 100644 --- a/tests/special_e2e/run_fully_async_policy.sh +++ 
b/tests/special_e2e/run_fully_async_policy.sh @@ -55,7 +55,7 @@ n_gpus_training=$((NUM_GPUS - n_gpus_rollout)) train_prompt_bsz=0 gen_prompt_bsz=1 n_resp_per_prompt=16 -train_prompt_mini_bsz=4 +train_prompt_mini_bsz=32 total_rollout_steps=$(((128*2))) test_freq=2 staleness_threshold=1 @@ -79,6 +79,7 @@ common_params=( data.gen_batch_size=${gen_prompt_bsz} data.return_raw_chat=${return_raw_chat} actor_rollout_ref.rollout.n=${n_resp_per_prompt} + actor_rollout_ref.rollout.calculate_log_probs=True algorithm.adv_estimator=${adv_estimator} algorithm.use_kl_in_reward=${use_kl_in_reward} algorithm.kl_ctrl.kl_coef=${kl_coef} diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py index e9383b109e5..8c49390f456 100644 --- a/verl/experimental/agent_loop/agent_loop.py +++ b/verl/experimental/agent_loop/agent_loop.py @@ -16,9 +16,8 @@ import logging import os import random -import time from abc import ABC, abstractmethod -from typing import Any, Optional, List +from typing import Any, Optional import hydra import numpy as np @@ -105,6 +104,7 @@ async def generate( return output async def generate_for_partial(self, request_id, prompt_ids, sampling_params): + """Generate tokens from prompt ids. 
with partial rollout function""" server = self._choose_server(request_id) output = await server.generate_for_partial.remote( request_id=request_id, @@ -136,6 +136,8 @@ class AgentLoopOutput(BaseModel): """Auxiliary performance metrics""" is_cancel: bool = False """Indicates whether the request was interrupted""" + log_probs: list[float] = None + """Response token log probs including LLM generated token, tool response token.""" # make hydra.utils.instantiate happy @@ -383,8 +385,7 @@ async def generate_sequences(self, batch: DataProto) -> DataProto: return output async def generate_sequences_no_post( - self, - batch: DataProto, partial_output_list: Optional[List[AgentLoopOutput]] + self, batch: DataProto, partial_output_list: Optional[list[AgentLoopOutput]] ) -> list[AgentLoopOutput]: """Generate sequences from agent loop. @@ -431,11 +432,9 @@ async def generate_sequences_no_post( if not partial_output_list: partial_output_list = [None] * len(batch) - for agent_name, messages, trajectory, partial_output in zip(agent_names, - raw_prompts, - trajectory_info, - partial_output_list, - strict=True): + for agent_name, messages, trajectory, partial_output in zip( + agent_names, raw_prompts, trajectory_info, partial_output_list, strict=True + ): tasks.append( asyncio.create_task( self._run_agent_loop(agent_name, messages.tolist(), sampling_params, trajectory, partial_output) @@ -608,10 +607,11 @@ def generate_sequences(self, prompts: DataProto) -> DataProto: output.meta_info = {"timing": timing} return output - async def generate_single_sample_async(self, - sample: DataProto, - partial_output_list: Optional[List[AgentLoopOutput]], - ) -> List[AgentLoopOutput]: + async def generate_single_sample_async( + self, + sample: DataProto, + partial_output_list: Optional[list[AgentLoopOutput]], + ) -> list[AgentLoopOutput]: """ 异步处理单个样本, 需要复制n次 diff --git a/verl/experimental/agent_loop/partial_single_turn_agent_loop.py b/verl/experimental/agent_loop/partial_single_turn_agent_loop.py 
index 899b83f1866..df4a4f3350a 100644 --- a/verl/experimental/agent_loop/partial_single_turn_agent_loop.py +++ b/verl/experimental/agent_loop/partial_single_turn_agent_loop.py @@ -35,7 +35,6 @@ def __init__(self, *args, **kwargs): async def run( self, messages: list[dict[str, Any]], sampling_params: dict[str, Any], output: Optional[AgentLoopOutput] ) -> AgentLoopOutput: - if not output: prompt_ids = await self.loop.run_in_executor( None, lambda: self.tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True) @@ -51,7 +50,7 @@ async def run( metrics = {} request_id = uuid4().hex with simple_timer("generate_sequences", metrics): - response_ids, is_cancel = await self.server_manager.generate_for_partial( + response_ids, log_probs, is_cancel = await self.server_manager.generate_for_partial( request_id=request_id, prompt_ids=prompt_ids, sampling_params=sampling_params ) @@ -60,6 +59,7 @@ async def run( # 暂停待恢复样本, 把输出结果加到 response_ids 后,并重置 response_mask else: prompt_ids = output.prompt_ids + log_probs = output.log_probs + log_probs response_ids = output.response_ids + response_ids response_mask = [1] * len(response_ids) @@ -70,4 +70,5 @@ async def run( num_turns=2, metrics=metrics, is_cancel=is_cancel, + log_probs=log_probs, ) diff --git a/verl/experimental/agent_loop/single_turn_agent_loop.py b/verl/experimental/agent_loop/single_turn_agent_loop.py index 492c1894cc5..df6e1991888 100644 --- a/verl/experimental/agent_loop/single_turn_agent_loop.py +++ b/verl/experimental/agent_loop/single_turn_agent_loop.py @@ -32,9 +32,9 @@ def __init__(self, *args, **kwargs): self.prompt_length = self.config.actor_rollout_ref.rollout.prompt_length self.response_length = self.config.actor_rollout_ref.rollout.response_length - async def run(self, messages: list[dict[str, Any]], - sampling_params: dict[str, Any], - output: Optional[AgentLoopOutput]) -> AgentLoopOutput: + async def run( + self, messages: list[dict[str, Any]], sampling_params: dict[str, Any], output: 
Optional[AgentLoopOutput] + ) -> AgentLoopOutput: metrics = {} request_id = uuid4().hex prompt_ids = await self.loop.run_in_executor( diff --git a/verl/experimental/agent_loop/tool_agent_loop.py b/verl/experimental/agent_loop/tool_agent_loop.py index a0642048dc7..7c945b7d4c9 100644 --- a/verl/experimental/agent_loop/tool_agent_loop.py +++ b/verl/experimental/agent_loop/tool_agent_loop.py @@ -56,10 +56,9 @@ def init_class(cls, config, tokenizer, **kwargs): cls.system_prompt = tokenizer.apply_chat_template([{}], add_generation_prompt=False, tokenize=True) @rollout_trace_op - async def run(self, - messages: list[dict[str, Any]], - sampling_params: dict[str, Any], - output: Optional[AgentLoopOutput]) -> AgentLoopOutput: + async def run( + self, messages: list[dict[str, Any]], sampling_params: dict[str, Any], output: Optional[AgentLoopOutput] + ) -> AgentLoopOutput: metrics = {} request_id = uuid4().hex prompt_ids = await self.loop.run_in_executor( diff --git a/verl/trainer/main_ppo.py b/verl/trainer/main_ppo.py index 7b34cbfaf23..4b240c6ffbf 100644 --- a/verl/trainer/main_ppo.py +++ b/verl/trainer/main_ppo.py @@ -38,6 +38,7 @@ def main(config): config_dict: Hydra configuration dictionary containing training parameters. 
""" from time import time + start_time = time() run_ppo(config) print(f"total time: {time() - start_time:.2f} seconds") diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py index 60621021b30..e61b1dc5fe0 100644 --- a/verl/trainer/ppo/ray_trainer.py +++ b/verl/trainer/ppo/ray_trainer.py @@ -1280,6 +1280,10 @@ def _process_batch_common(self, batch, metrics, timing_raw): "training/rollout_probs_diff_std": rollout_probs_diff_std.detach().item(), } ) + if self.config.async_training and self.config.async_training.use_rollout_log_probs: + batch.batch["old_log_probs"] = batch.batch["rollout_log_probs"] + del actor_old_log_probs + if self.use_reference_policy: # compute reference log_prob with marked_timer("ref", timing_raw, color="olive"): diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py index 7ce640e33cb..3b3e9542252 100644 --- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py +++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py @@ -15,16 +15,14 @@ import logging import os import pickle -from contextlib import ExitStack -from typing import Any, Callable, Optional, Coroutine, Sequence +from typing import Any, Callable, Optional, Sequence import ray import zmq -from omegaconf import DictConfig, ListConfig +from omegaconf import DictConfig from starlette.requests import Request from starlette.responses import JSONResponse, StreamingResponse -from vllm import SamplingParams, RequestOutput -from vllm.config import CompilationConfig, CompilationLevel +from vllm import SamplingParams from vllm.engine.arg_utils import AsyncEngineArgs from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.protocol import ChatCompletionRequest, ChatCompletionResponse, ErrorResponse @@ -337,7 +335,7 @@ async def generate(self, prompt_ids: list[int], sampling_params: dict[str, Any], async def _generate_step(self, prompt_ids: list[int], sampling_params: dict[str, 
Any], request_id: str): max_tokens = self.max_model_len - len(prompt_ids) - sampling_params = SamplingParams(max_tokens=max_tokens, **sampling_params) + sampling_params = SamplingParams(max_tokens=max_tokens, logprobs=1, **sampling_params) prompt = TokensPrompt(prompt_token_ids=prompt_ids) generator = self.engine.generate(prompt=prompt, sampling_params=sampling_params, request_id=request_id) @@ -349,12 +347,12 @@ async def _generate_step(self, prompt_ids: list[int], sampling_params: dict[str, async def generate_for_partial( self, prompt_ids: list[int], sampling_params: dict[str, Any], request_id: str - ) -> tuple[Sequence[int], bool] | tuple[str, bool]: + ) -> tuple[list[Any], list[Any], bool] | tuple[Sequence[int], list[float], Any]: # 设置中断标志 async with self.lock: if self.paused: # cancel 后, 所有任务直接返回,等待下次提交 - return [], True + return [], [], True self.cancel_event[request_id] = asyncio.Event() cancel_handle = asyncio.create_task(self.cancel_event[request_id].wait()) generation_handle = asyncio.create_task(self._generate_step(prompt_ids, sampling_params, request_id)) @@ -369,10 +367,15 @@ async def generate_for_partial( async with self.lock: token_ids = self.req_output[request_id].outputs[0].token_ids + log_probs: list[float] = [] + for i, x in enumerate(self.req_output[request_id].outputs[0].logprobs): + # sampling_params 中 logprobs 设置为1,只返回1个 + token_id = self.req_output[request_id].outputs[0].token_ids[i] + log_probs.append(x[token_id].logprob) is_cancel = generation_handle not in done self.cancel_event.pop(request_id, None) self.req_output.pop(request_id, None) - return token_ids, is_cancel + return token_ids, log_probs, is_cancel async def cancel(self): async with self.lock: From fdd8af0fc9bc041044bb37e83787947b5c14c694 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Sun, 31 Aug 2025 23:59:54 +0800 Subject: [PATCH 086/182] batch.meta_info.items() --- recipe/fully_async_policy/fully_async_trainer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff 
--git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index af406623145..d9258a1c935 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -285,7 +285,7 @@ def fit(self): "fully_async/current_param_version": self.current_param_version, } ) - for key, value in batch.meta_info: + for key, value in batch.meta_info.items(): if key.startswith("fully_async"): metrics[key] = value @@ -320,3 +320,5 @@ def _trigger_parameter_sync_after_step(self): else: self.local_trigger_step += 1 return + + From 9444d19e3cc023b404701345ab08705fe7b8cc6f Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Mon, 1 Sep 2025 00:40:12 +0800 Subject: [PATCH 087/182] total wait time --- recipe/fully_async_policy/fully_async_trainer.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 6d74ee215f8..8340b22f6c6 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -182,20 +182,21 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]: if not queue_samples or len(queue_samples) < self.required_samples: print("[FullyAsyncTrainer] not enough samples collected after loop") return None, None + total_wait_time = consumer_end - consumer_start print( f"[FullyAsyncTrainer] Loop collection completed: {len(queue_samples)}/{self.required_samples} samples, " - f"total wait time: {consumer_end - consumer_start:.2f} seconds" + f"total wait time: {total_wait_time:.2f} seconds" ) queue_samples = [ray.cloudpickle.loads(x) for x in queue_samples] - # print(queue_samples) # Assemble batch - now working directly with RolloutSample objects if self.config.trainer.balance_batch: batch = assemble_batch_from_rollout_samples(queue_samples, self.tokenizer, self.config, self._balance_batch) else: batch = 
assemble_batch_from_rollout_samples(queue_samples, self.tokenizer, self.config, None) - # print(f" _assemble_gen_batch_output_from_queue_samples {batch}") + + batch.meta_info["fully_async/total_wait_time"] = total_wait_time return 0, batch def _create_actor_rollout_classes(self): From 69c2427a04341158b5a2a22d15a4084092c18811 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Mon, 1 Sep 2025 17:03:31 +0800 Subject: [PATCH 088/182] from .detach_sharding_manager import DetachShardingManager --- recipe/one_step_off_policy/megatron_workers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipe/one_step_off_policy/megatron_workers.py b/recipe/one_step_off_policy/megatron_workers.py index 5b338c5be42..a9318b8f7b3 100644 --- a/recipe/one_step_off_policy/megatron_workers.py +++ b/recipe/one_step_off_policy/megatron_workers.py @@ -168,7 +168,7 @@ def init_model(self): ) log_gpu_memory_usage("After building vllm rollout", logger=logger) - from sharding_manager import DetachShardingManager + from .detach_sharding_manager import DetachShardingManager rollout_sharding_manager = DetachShardingManager( inference_engine=rollout.inference_engine, device_mesh=rollout_device_mesh From 237d766739fa8f5736319132334eab30b8b54530 Mon Sep 17 00:00:00 2001 From: wangshulin02 Date: Mon, 1 Sep 2025 21:36:43 +0800 Subject: [PATCH 089/182] fix validate frequent bug & add final validate --- .../config/fully_async_ppo_trainer.yaml | 1 + .../dapo_7b_math_fsdp2_2_6.sh | 4 +-- .../dapo_7b_math_fsdp2_8_8.sh | 2 +- recipe/fully_async_policy/fully_async_main.py | 4 ++- .../fully_async_rollouter.py | 22 ++++++------ .../fully_async_policy/fully_async_trainer.py | 36 +++++++++++++------ recipe/fully_async_policy/message_queue.py | 20 +++++++++++ recipe/fully_async_policy/param_sync.py | 14 ++++---- tests/special_e2e/run_fully_async_policy.sh | 2 +- 9 files changed, 72 insertions(+), 33 deletions(-) diff --git a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml 
b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml index 3334ee4f4d5..c1f94b56b6b 100644 --- a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml +++ b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml @@ -22,6 +22,7 @@ rollout: n: 4 # 每个prompt生成的响应数量 total_rollout_steps: 100 total_epochs: 10 + test_freq: 1 # 测试频率, 每多少次参数更新后进行一次测试 data: gen_batch_size: 32 diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_2_6.sh b/recipe/fully_async_policy/dapo_7b_math_fsdp2_2_6.sh index 58017f0123b..5f654227d15 100644 --- a/recipe/fully_async_policy/dapo_7b_math_fsdp2_2_6.sh +++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_2_6.sh @@ -77,7 +77,7 @@ gen_prompt_bsz=1 n_resp_per_prompt=16 train_prompt_mini_bsz=64 total_rollout_steps=$(((512*100))) -test_freq=5 +test_freq=2 staleness_threshold=1 trigger_parameter_sync_step=16 partial_rollout=True @@ -156,7 +156,6 @@ $PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \ trainer.project_name="${project_name}" \ trainer.experiment_name="${exp_name}" \ trainer.val_before_train=True \ - trainer.test_freq="${test_freq}" \ trainer.save_freq=-1 \ trainer.default_local_dir="${CKPTS_DIR}" \ trainer.resume_mode=auto \ @@ -166,6 +165,7 @@ $PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \ rollout.n_gpus_per_node="${n_gpus_rollout}" \ rollout.total_rollout_steps="${total_rollout_steps}" \ rollout.total_epochs=10 \ + rollout.test_freq="${test_freq}" \ async_training.staleness_threshold="${staleness_threshold}" \ async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ async_training.partial_rollout="${partial_rollout}" diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh b/recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh index 52ee0136d5a..c65080ba548 100644 --- a/recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh +++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh @@ -155,7 +155,6 @@ $PYTHON_INTERPRETER -m 
recipe.fully_async_policy.fully_async_main \ trainer.project_name="${project_name}" \ trainer.experiment_name="${exp_name}" \ trainer.val_before_train=True \ - trainer.test_freq="${test_freq}" \ trainer.save_freq=-1 \ trainer.default_local_dir="${CKPTS_DIR}" \ trainer.resume_mode=auto \ @@ -165,6 +164,7 @@ $PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \ rollout.n_gpus_per_node="${n_gpus_rollout}" \ rollout.total_rollout_steps="${total_rollout_steps}" \ rollout.total_epochs=10 \ + rollout.test_freq="${test_freq}" \ async_training.staleness_threshold="${staleness_threshold}" \ async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ async_training.partial_rollout="${partial_rollout}" diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py index 2b5663bd5ea..532a425d126 100644 --- a/recipe/fully_async_policy/fully_async_main.py +++ b/recipe/fully_async_policy/fully_async_main.py @@ -216,7 +216,9 @@ def _initialize_components(self, config) -> None: ) ray.get(self.components["trainer"].set_parameter_synchronizer.remote(param_synchronizer)) - ray.get(param_synchronizer.sync_weights.remote(0)) + # load checkpoint and sync parameter before doing anything + ray.get(self.components["trainer"].load_checkpoint.remote()) + ray.get(param_synchronizer.sync_weights.remote(version=0)) self.components["param_synchronizer"] = param_synchronizer print("[ASYNC MAIN] All components initialized successfully") diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 44768cacb29..426a5cad430 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -186,7 +186,7 @@ def get_max_queue_size(self): def get_total_train_steps(self): return self.total_train_steps - async def update_param_version(self, version: int): + async def update_param_version(self, version: int, 
last_sync: bool = False): """Update current parameter version""" async with self.lock: old_version = self.current_param_version @@ -198,11 +198,13 @@ async def update_param_version(self, version: int): f"Parameter version updated from {old_version} to {version}" ) timing_raw = {} - is_last_step = self.global_steps >= self.total_training_steps if ( self.val_reward_fn is not None - and self.config.trainer.test_freq > 0 - and ((self.global_steps > 0 and self.global_steps % self.config.trainer.test_freq == 0) or is_last_step) + and self.config.rollout.test_freq > 0 + and self.current_param_version % self.config.rollout.test_freq == 0 # test_freq 表示每多少步参数更新测试一次 + and self.current_param_version > 0 # don't test here in the initial parameter sync + ) or ( + last_sync and self.val_reward_fn is not None ): with marked_timer("testing", timing_raw, color="green"): val_metrics: dict = self._validate() @@ -429,20 +431,18 @@ async def _streaming_generation_main(self): config=OmegaConf.to_container(self.config, resolve=True), ) - # load checkpoint before doing anything - self._load_checkpoint() # TODO: 检查是否需要 - # perform validation before training # currently, we only support validation using the reward_function. 
- async with self.lock: # TODO: 检查是否需要锁 + async with self.lock: if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True): - print("Initial validation metric") + print("[FullyAsyncRollouter] Initial validating before training...") val_metrics = self._validate() assert val_metrics, f"{val_metrics=}" pprint(f"[FullyAsyncRollouter] Initial validation metrics: {val_metrics}") data = ValidateMetrics(timing_raw={}, metrics=val_metrics) await self.message_queue_client.put_validate(ray.cloudpickle.dumps(data)) - if self.config.trainer.get("val_only", False): + if self.config.trainer.get("val_only", False): # TODO: 是否需要保留此功能 + return # we start from step 1 @@ -544,7 +544,7 @@ async def _async_monitor_loop(self): while True: async with self.lock: - if not self.running: + if not self.running and self.message_queue_client.is_training_ended(): break await asyncio.sleep(check_interval) # 定期打印统计信息 diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index b43b3d87297..5accc435422 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -252,8 +252,6 @@ def fit(self): config=OmegaConf.to_container(self.config, resolve=True), ) - # load checkpoint before doing anything - self._load_checkpoint() self.max_steps_duration = 0 # Use queue mode, no need for traditional dataloader iterator # Initialize to get the first batch of data @@ -307,19 +305,35 @@ def fit(self): self._trigger_parameter_sync_after_step() self.global_steps += 1 - def _trigger_parameter_sync_after_step(self): + # final parameter sync and validate + self._trigger_parameter_sync_after_step(last_sync=True) + val_data = self.message_queue_client.get_validate_sync() + + if val_data: + val_data: ValidateMetrics = ray.cloudpickle.loads(val_data) + from pprint import pprint + pprint(f"[FullyAsyncTrainer] Final validation metrics: {val_data.metrics}") + # TODO: 是否需要计入log + + 
print("[FullyAsyncTrainer] Training completed, sending end signal...,sleeping") + time.sleep(10) + self.message_queue_client.set_training_end() + print("[FullyAsyncTrainer] End signal sent") + + self._check_save_checkpoint(True, timing_raw) # TODO: 检查checkpoint + + def load_checkpoint(self): + return self._load_checkpoint() + + def _trigger_parameter_sync_after_step(self, last_sync: bool = False): """ Trigger parameter synchronization after training step This ensures rollouter always uses the latest trained parameters """ - if self.local_trigger_step >= self.trigger_parameter_sync_step: - self.local_trigger_step = 1 - self.current_param_version = self.current_param_version + 1 - ray.get(self.param_synchronizer.sync_weights.remote(self.current_param_version)) - self.progress_bar.update(1) - return - else: + if self.local_trigger_step < self.trigger_parameter_sync_step and not last_sync: self.local_trigger_step += 1 return - + self.current_param_version += 1 + self.local_trigger_step = 1 + ray.get(self.param_synchronizer.sync_weights.remote(self.current_param_version, last_sync=last_sync)) \ No newline at end of file diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py index 13e1a3e21e4..5aecba389f7 100644 --- a/recipe/fully_async_policy/message_queue.py +++ b/recipe/fully_async_policy/message_queue.py @@ -51,6 +51,9 @@ def __init__(self, config: DictConfig, max_queue_size: int = 1000): # Asyncio for message handling self.running = True + + # trainer end signal + self.training_ended = False # async safe - 在第一次使用时初始化 self._lock = asyncio.Lock() @@ -204,6 +207,15 @@ async def get_validate(self): else: return None + async def set_training_end(self): + """set training end signal""" + async with self._lock: + self.training_ended = True + + async def is_training_ended(self): + """check training end signal""" + async with self._lock: + return self.training_ended class MessageQueueClient: """Asyncio-compatible MessageQueue 
client for communicating with MessageQueue Actor""" @@ -269,3 +281,11 @@ def get_statistics_sync(self) -> dict[str, Any]: def update_param_version_sync(self, version: int): """Update parameter version (async)""" return ray.get(self.queue_actor.update_param_version.remote(version)) + + def set_training_end(self): + """Notify the end of training""" + return ray.get(self.queue_actor.set_training_end.remote()) + + def is_training_ended(self): + """Check if training is finished""" + return ray.get(self.queue_actor.is_training_ended.remote()) \ No newline at end of file diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py index 7e75865ebd5..ad8dfed56b5 100644 --- a/recipe/fully_async_policy/param_sync.py +++ b/recipe/fully_async_policy/param_sync.py @@ -71,7 +71,7 @@ def _init_sync_group(self): group_name=self.sync_group_name, ) - def sync_weights(self, version): + def sync_weights(self, version, last_sync = False): start_time = time.time() self.current_version = version @@ -85,11 +85,13 @@ def sync_weights(self, version): # sync weights self.actor_wg.sync_rollout_weights() ray.get(self.rollout_wg.sync_rollout_weights()) + end_time = time.time() + print(f"[ParameterSynchronizer] sync_weights success. cost {end_time - start_time:.2f} seconds") - # Async Update rollout version - self.rollouter.update_param_version.remote(version) - + # Async Update rollout version & validation + self.rollouter.update_param_version.remote(version, last_sync) ray.get(self.rollouter.resume.remote()) - end_time = time.time() - print(f"[ParameterSynchronizer] sync_weights success. cost {end_time - start_time:.2f} seconds") + print(f"[ParameterSynchronizer] Update rollout version & validation done. 
cost {time.time() - end_time:.2f} seconds") + + diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh index 64f9fa82825..51691a8800f 100644 --- a/tests/special_e2e/run_fully_async_policy.sh +++ b/tests/special_e2e/run_fully_async_policy.sh @@ -119,7 +119,6 @@ common_params=( trainer.project_name='verl-test-fully-async' trainer.experiment_name="${exp_name}" trainer.val_before_train=True - trainer.test_freq=-1 trainer.save_freq=-1 trainer.resume_mode=disable trainer.nnodes=1 @@ -128,6 +127,7 @@ common_params=( rollout.n_gpus_per_node=${n_gpus_rollout} rollout.total_rollout_steps=${total_rollout_steps} rollout.total_epochs=2 + rollout.test_freq=${test_freq} # Fully async specific configurations async_training.staleness_threshold=${staleness_threshold} async_training.partial_rollout="${partial_rollout}" From 66cc990edaff02c0e5bf422e38b757ec5cb9f450 Mon Sep 17 00:00:00 2001 From: wangshulin02 Date: Tue, 2 Sep 2025 15:29:13 +0800 Subject: [PATCH 090/182] remove unnecessary code, fix validate logic --- recipe/fully_async_policy/detach_utils.py | 4 +- recipe/fully_async_policy/fully_async_main.py | 4 +- .../fully_async_rollouter.py | 34 +++--------- .../fully_async_policy/fully_async_trainer.py | 53 ++++++++++--------- recipe/fully_async_policy/message_queue.py | 22 +------- recipe/fully_async_policy/param_sync.py | 7 +-- 6 files changed, 47 insertions(+), 77 deletions(-) diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py index 79df36652f7..c28cbf9e631 100644 --- a/recipe/fully_async_policy/detach_utils.py +++ b/recipe/fully_async_policy/detach_utils.py @@ -13,7 +13,7 @@ # limitations under the License. 
import time from dataclasses import dataclass -from typing import Any +from typing import Any, Optional import numpy as np import torch @@ -52,6 +52,8 @@ class RolloutSample: class ValidateMetrics: timing_raw: dict[str, Any] metrics: dict[str, Any] + global_steps: Optional[int] = None + param_version: Optional[int] = None def prepare_single_generation_data(batch_dict, global_steps, rollout_n) -> DataProto: diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py index 532a425d126..09961c85391 100644 --- a/recipe/fully_async_policy/fully_async_main.py +++ b/recipe/fully_async_policy/fully_async_main.py @@ -217,8 +217,10 @@ def _initialize_components(self, config) -> None: ray.get(self.components["trainer"].set_parameter_synchronizer.remote(param_synchronizer)) # load checkpoint and sync parameter before doing anything + val_before_train = val_reward_fn is not None and config.trainer.get("val_before_train", True) ray.get(self.components["trainer"].load_checkpoint.remote()) - ray.get(param_synchronizer.sync_weights.remote(version=0)) + ray.get(param_synchronizer.sync_weights.remote(version=0, + validate=val_before_train)) self.components["param_synchronizer"] = param_synchronizer print("[ASYNC MAIN] All components initialized successfully") diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 426a5cad430..4e7f911fb1e 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -186,7 +186,7 @@ def get_max_queue_size(self): def get_total_train_steps(self): return self.total_train_steps - async def update_param_version(self, version: int, last_sync: bool = False): + async def update_param_version(self, version: int, validate: bool = False, global_steps: int = 0): """Update current parameter version""" async with self.lock: old_version = self.current_param_version @@ -201,14 +201,16 @@ async 
def update_param_version(self, version: int, last_sync: bool = False): if ( self.val_reward_fn is not None and self.config.rollout.test_freq > 0 - and self.current_param_version % self.config.rollout.test_freq == 0 # test_freq 表示每多少步参数更新测试一次 + and self.current_param_version % self.config.rollout.test_freq == 0 and self.current_param_version > 0 # don't test here in the initial parameter sync ) or ( - last_sync and self.val_reward_fn is not None + validate and self.val_reward_fn is not None ): with marked_timer("testing", timing_raw, color="green"): val_metrics: dict = self._validate() - data = ValidateMetrics(timing_raw=timing_raw, metrics=val_metrics) + data = ValidateMetrics(timing_raw=timing_raw, + metrics=val_metrics, + global_steps=global_steps) await self.message_queue_client.put_validate(ray.cloudpickle.dumps(data)) def _validate_config(self): @@ -422,28 +424,6 @@ async def _consumer_worker(self): async def _streaming_generation_main(self): """流式处理的主入口方法,包含初始化和验证逻辑""" - from verl.utils.tracking import Tracking - - self.logger = Tracking( - project_name=self.config.trainer.project_name, - experiment_name=self.config.trainer.experiment_name, - default_backend=self.config.trainer.logger, - config=OmegaConf.to_container(self.config, resolve=True), - ) - - # perform validation before training - # currently, we only support validation using the reward_function. 
- async with self.lock: - if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True): - print("[FullyAsyncRollouter] Initial validating before training...") - val_metrics = self._validate() - assert val_metrics, f"{val_metrics=}" - pprint(f"[FullyAsyncRollouter] Initial validation metrics: {val_metrics}") - data = ValidateMetrics(timing_raw={}, metrics=val_metrics) - await self.message_queue_client.put_validate(ray.cloudpickle.dumps(data)) - if self.config.trainer.get("val_only", False): # TODO: 是否需要保留此功能 - - return # we start from step 1 self.global_steps += 1 @@ -544,7 +524,7 @@ async def _async_monitor_loop(self): while True: async with self.lock: - if not self.running and self.message_queue_client.is_training_ended(): + if not self.running: break await asyncio.sleep(check_interval) # 定期打印统计信息 diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 5accc435422..17415f496d7 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -36,7 +36,7 @@ WorkerType, ) from verl.utils.debug import marked_timer - +from pprint import pprint @ray.remote(num_cpus=10) class FullyAsyncTrainer(RayPPOTrainer): @@ -253,18 +253,21 @@ def fit(self): ) self.max_steps_duration = 0 + + # get validate data before training + val_data = self.message_queue_client.get_validate_sync() + if val_data: + val_data: ValidateMetrics = ray.cloudpickle.loads(val_data) + logger.log(data=val_data.metrics, step=val_data.global_steps) + logger.log(data=val_data.timing_raw, step=val_data.global_steps) + pprint(f"[FullyAsyncTrainer] Initial validation metrics: {val_data.metrics}") + # Use queue mode, no need for traditional dataloader iterator # Initialize to get the first batch of data while True: metrics = {} timing_raw = {} - val_data = self.message_queue_client.get_validate_sync() - if val_data: - val_data: ValidateMetrics = 
ray.cloudpickle.loads(val_data) - metrics.update(val_data.metrics) - timing_raw.update(val_data.timing_raw) - with marked_timer("step", timing_raw): with marked_timer("gen", timing_raw, color="red"): epoch, batch = self._get_samples_from_queue() @@ -302,38 +305,40 @@ def fit(self): f"trigger_parameter_sync_step: {self.trigger_parameter_sync_step} " f"{time_str}" ) - self._trigger_parameter_sync_after_step() + self._trigger_parameter_sync_after_step(global_steps=self.global_steps) + val_data = self.message_queue_client.get_validate_sync() + if val_data: + val_data: ValidateMetrics = ray.cloudpickle.loads(val_data) + logger.log(data=val_data.metrics, step=val_data.global_steps) + logger.log(data=val_data.timing_raw, step=val_data.global_steps) self.global_steps += 1 # final parameter sync and validate - self._trigger_parameter_sync_after_step(last_sync=True) - val_data = self.message_queue_client.get_validate_sync() - - if val_data: - val_data: ValidateMetrics = ray.cloudpickle.loads(val_data) - from pprint import pprint - pprint(f"[FullyAsyncTrainer] Final validation metrics: {val_data.metrics}") - # TODO: 是否需要计入log - - print("[FullyAsyncTrainer] Training completed, sending end signal...,sleeping") - time.sleep(10) - self.message_queue_client.set_training_end() - print("[FullyAsyncTrainer] End signal sent") + if val_data is None: + self._trigger_parameter_sync_after_step(validate=True, global_steps=self.global_steps-1) + val_data = self.message_queue_client.get_validate_sync() + if val_data: + val_data: ValidateMetrics = ray.cloudpickle.loads(val_data) + logger.log(data=val_data.metrics, step=val_data.global_steps) + logger.log(data=val_data.timing_raw, step=val_data.global_steps) + pprint(f"[FullyAsyncTrainer] Final validation metrics: {val_data.metrics}") self._check_save_checkpoint(True, timing_raw) # TODO: 检查checkpoint def load_checkpoint(self): return self._load_checkpoint() - def _trigger_parameter_sync_after_step(self, last_sync: bool = False): + def 
_trigger_parameter_sync_after_step(self, validate: bool = False, global_steps: int = None): """ Trigger parameter synchronization after training step This ensures rollouter always uses the latest trained parameters """ - if self.local_trigger_step < self.trigger_parameter_sync_step and not last_sync: + if self.local_trigger_step < self.trigger_parameter_sync_step and not validate: self.local_trigger_step += 1 return self.current_param_version += 1 self.local_trigger_step = 1 - ray.get(self.param_synchronizer.sync_weights.remote(self.current_param_version, last_sync=last_sync)) \ No newline at end of file + ray.get(self.param_synchronizer.sync_weights.remote(self.current_param_version, + validate=validate, + global_steps=global_steps)) \ No newline at end of file diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py index 5aecba389f7..0520ec98034 100644 --- a/recipe/fully_async_policy/message_queue.py +++ b/recipe/fully_async_policy/message_queue.py @@ -51,9 +51,6 @@ def __init__(self, config: DictConfig, max_queue_size: int = 1000): # Asyncio for message handling self.running = True - - # trainer end signal - self.training_ended = False # async safe - 在第一次使用时初始化 self._lock = asyncio.Lock() @@ -207,15 +204,6 @@ async def get_validate(self): else: return None - async def set_training_end(self): - """set training end signal""" - async with self._lock: - self.training_ended = True - - async def is_training_ended(self): - """check training end signal""" - async with self._lock: - return self.training_ended class MessageQueueClient: """Asyncio-compatible MessageQueue client for communicating with MessageQueue Actor""" @@ -280,12 +268,4 @@ def get_statistics_sync(self) -> dict[str, Any]: def update_param_version_sync(self, version: int): """Update parameter version (async)""" - return ray.get(self.queue_actor.update_param_version.remote(version)) - - def set_training_end(self): - """Notify the end of training""" - return 
ray.get(self.queue_actor.set_training_end.remote()) - - def is_training_ended(self): - """Check if training is finished""" - return ray.get(self.queue_actor.is_training_ended.remote()) \ No newline at end of file + return ray.get(self.queue_actor.update_param_version.remote(version)) \ No newline at end of file diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py index ad8dfed56b5..4cf39e5355b 100644 --- a/recipe/fully_async_policy/param_sync.py +++ b/recipe/fully_async_policy/param_sync.py @@ -71,7 +71,7 @@ def _init_sync_group(self): group_name=self.sync_group_name, ) - def sync_weights(self, version, last_sync = False): + def sync_weights(self, version, validate = False, global_steps = 0): start_time = time.time() self.current_version = version @@ -89,9 +89,10 @@ def sync_weights(self, version, last_sync = False): print(f"[ParameterSynchronizer] sync_weights success. cost {end_time - start_time:.2f} seconds") # Async Update rollout version & validation - self.rollouter.update_param_version.remote(version, last_sync) + self.rollouter.update_param_version.remote(version, validate, global_steps) ray.get(self.rollouter.resume.remote()) - print(f"[ParameterSynchronizer] Update rollout version & validation done. cost {time.time() - end_time:.2f} seconds") + print(f"[ParameterSynchronizer] Update rollout version & validation done. 
\ + cost {time.time() - end_time:.2f} seconds") From b405c6646438831de9824f3857bef80eae192d79 Mon Sep 17 00:00:00 2001 From: arron Date: Tue, 2 Sep 2025 18:11:53 +0800 Subject: [PATCH 091/182] 8_8 --- recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh | 3 ++- recipe/fully_async_policy/runtime_env.yaml | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh b/recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh index 52ee0136d5a..61ec5d3c1e3 100644 --- a/recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh +++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh @@ -75,7 +75,7 @@ n_gpus_training=8 train_prompt_bsz=0 gen_prompt_bsz=1 n_resp_per_prompt=16 -train_prompt_mini_bsz=4 +train_prompt_mini_bsz=64 total_rollout_steps=$(((512*100))) test_freq=10 staleness_threshold=1 @@ -145,6 +145,7 @@ $PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \ actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ actor_rollout_ref.rollout.name=${rollout_name} \ actor_rollout_ref.rollout.mode=${rollout_mode} \ + actor_rollout_ref.rollout.calculate_log_probs=True \ reward_model.reward_manager=dapo \ +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ diff --git a/recipe/fully_async_policy/runtime_env.yaml b/recipe/fully_async_policy/runtime_env.yaml index 81c7c9f4265..5dcf269faa8 100644 --- a/recipe/fully_async_policy/runtime_env.yaml +++ b/recipe/fully_async_policy/runtime_env.yaml @@ -1,2 +1,5 @@ env_vars: - VLLM_USE_V1: "1" \ No newline at end of file + VLLM_USE_V1: "1" + TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard" + NCCL_DEBUG: "INFO" + HYDRA_FULL_ERROR: "1" \ No newline at end of file From 5919f05ace7357b2fdc8c6a36f0ee5530552dc63 Mon Sep 17 00:00:00 2001 From: wangshulin02 <953550366@qq.com> Date: Tue, 2 Sep 2025 19:21:04 +0800 
Subject: [PATCH 092/182] fix trainer and rollouter validation asynchrony --- recipe/fully_async_policy/fully_async_rollouter.py | 8 ++++---- recipe/fully_async_policy/fully_async_trainer.py | 1 + recipe/fully_async_policy/param_sync.py | 14 +++++++++----- tests/special_e2e/run_fully_async_policy.sh | 2 +- 4 files changed, 15 insertions(+), 10 deletions(-) diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 4e7f911fb1e..0bc871b6d7f 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -208,10 +208,10 @@ async def update_param_version(self, version: int, validate: bool = False, globa ): with marked_timer("testing", timing_raw, color="green"): val_metrics: dict = self._validate() - data = ValidateMetrics(timing_raw=timing_raw, - metrics=val_metrics, - global_steps=global_steps) - await self.message_queue_client.put_validate(ray.cloudpickle.dumps(data)) + data = ValidateMetrics(timing_raw=timing_raw, + metrics=val_metrics, + global_steps=global_steps) + await self.message_queue_client.put_validate(ray.cloudpickle.dumps(data)) def _validate_config(self): # Validate asynchronous training configuration diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 17415f496d7..cca95efab66 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -339,6 +339,7 @@ def _trigger_parameter_sync_after_step(self, validate: bool = False, global_step self.current_param_version += 1 self.local_trigger_step = 1 + ray.get(self.param_synchronizer.wait_last_sync.remote()) ray.get(self.param_synchronizer.sync_weights.remote(self.current_param_version, validate=validate, global_steps=global_steps)) \ No newline at end of file diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py index 
4cf39e5355b..34fbca1c3e3 100644 --- a/recipe/fully_async_policy/param_sync.py +++ b/recipe/fully_async_policy/param_sync.py @@ -41,6 +41,7 @@ def __init__(self, config, trainer, rollouter, mq): self.weights_info = None self.sync_group_initialized = False self.sync_group_name = "actor_rollout" + self.wait_last = None # Statistics self.current_version = 0 @@ -90,9 +91,12 @@ def sync_weights(self, version, validate = False, global_steps = 0): # Async Update rollout version & validation self.rollouter.update_param_version.remote(version, validate, global_steps) - ray.get(self.rollouter.resume.remote()) - - print(f"[ParameterSynchronizer] Update rollout version & validation done. \ - cost {time.time() - end_time:.2f} seconds") - + self.wait_last = self.rollouter.resume.remote() + + def wait_last_sync(self): + print(f"[ParameterSynchronizer] waiting last parameter sync and validate...") + start_time = time.time() + if self.wait_last: + ray.get(self.wait_last) + print(f"[ParameterSynchronizer], cost: {time.time() - start_time:.2f} seconds") diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh index 51691a8800f..142ee3e8806 100644 --- a/tests/special_e2e/run_fully_async_policy.sh +++ b/tests/special_e2e/run_fully_async_policy.sh @@ -57,7 +57,7 @@ gen_prompt_bsz=1 n_resp_per_prompt=16 train_prompt_mini_bsz=32 total_rollout_steps=$(((128*2))) -test_freq=2 +test_freq=10 staleness_threshold=1 trigger_parameter_sync_step=1 partial_rollout=True From cfa324919c4badbe580d6927e00fb66ac849c290 Mon Sep 17 00:00:00 2001 From: arron Date: Tue, 2 Sep 2025 22:14:36 +0800 Subject: [PATCH 093/182] TENSORBOARD_DIR --- .../dapo_7b_math_fsdp2_4_12.sh | 171 ++++++++++++++++++ recipe/fully_async_policy/runtime_env.yaml | 2 +- 2 files changed, 172 insertions(+), 1 deletion(-) create mode 100644 recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh 
b/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh new file mode 100644 index 00000000000..2b4bf9c31fe --- /dev/null +++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh @@ -0,0 +1,171 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +project_name='DAPO' +exp_name='DAPO-Qwen2.5-7b-MATH-0527a1-fsdp2-fully-async-8-8' + +# Ray +# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} +# WORKING_DIR=${WORKING_DIR:-"${PWD}"} +# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} +# Paths +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface +MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"} +CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} +TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} +TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} + +MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B +CKPTS_DIR=./ckpts/${project_name}/${exp_name} +TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet +TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet + +rollout_mode="async" +rollout_name="vllm" # sglang or vllm +if [ "$rollout_mode" = "async" ]; then + export VLLM_USE_V1=1 + return_raw_chat="True" +fi + +# Algorithm parameters +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +# Response length parameters +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 * 8)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 + +# Training parameters +loss_agg_mode="token-mean" + +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF 
rollout, -1 for vLLM rollout +val_top_p=0.7 + +# Performance Related Parameter +use_dynamic_bsz=True +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) +ref_offload=True +actor_offload=False +gen_tp=1 +sp_size=1 +fsdp_size=2 + +# Fully async specific parameters +NNODES=${NNODES:-2} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} + +n_gpus_rollout=2 +n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout)) + +train_prompt_bsz=0 +gen_prompt_bsz=1 +n_resp_per_prompt=16 +train_prompt_mini_bsz=64 +total_rollout_steps=$(((512*100))) +test_freq=10 +staleness_threshold=1 +trigger_parameter_sync_step=16 +partial_rollout=True + +PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python" +if [ ! -x "$PYTHON_INTERPRETER" ]; then + PYTHON_INTERPRETER="python3" +fi + +$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + data.gen_batch_size=${gen_prompt_bsz} \ + data.return_raw_chat=${return_raw_chat} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.strategy=fsdp2 \ + critic.strategy=fsdp2 \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.hybrid_engine=False \ + +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ + 
actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.grad_clip=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \ + actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ + 
actor_rollout_ref.rollout.name=${rollout_name} \ + actor_rollout_ref.rollout.mode=${rollout_mode} \ + actor_rollout_ref.rollout.calculate_log_probs=True \ + reward_model.reward_manager=dapo \ + +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger=['console','tensorboard'] \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.val_before_train=True \ + trainer.test_freq="${test_freq}" \ + trainer.save_freq=-1 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.resume_mode=auto \ + trainer.nnodes="${NNODES}" \ + trainer.n_gpus_per_node="${n_gpus_training}" \ + rollout.nnodes="${NNODES}" \ + rollout.n_gpus_per_node="${n_gpus_rollout}" \ + rollout.total_rollout_steps="${total_rollout_steps}" \ + rollout.total_epochs=10 \ + async_training.staleness_threshold="${staleness_threshold}" \ + async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ + async_training.partial_rollout="${partial_rollout}" diff --git a/recipe/fully_async_policy/runtime_env.yaml b/recipe/fully_async_policy/runtime_env.yaml index 5dcf269faa8..dcca08e67f7 100644 --- a/recipe/fully_async_policy/runtime_env.yaml +++ b/recipe/fully_async_policy/runtime_env.yaml @@ -1,5 +1,5 @@ env_vars: VLLM_USE_V1: "1" - TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard" + TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard/dapo_7b_math_fsdp2_4_12" NCCL_DEBUG: "INFO" HYDRA_FULL_ERROR: "1" \ No newline at end of file From 54448199b22159f8823b3000d2517efb5241c369 Mon Sep 17 00:00:00 2001 From: wangshulin02 
<953550366@qq.com> Date: Wed, 3 Sep 2025 00:12:42 +0800 Subject: [PATCH 094/182] simple implementation of Metrics Aggregator --- recipe/fully_async_policy/detach_utils.py | 216 +++++++++++++++++- .../fully_async_rollouter.py | 3 +- .../fully_async_policy/fully_async_trainer.py | 34 ++- recipe/fully_async_policy/runtime_env.yaml | 5 +- 4 files changed, 246 insertions(+), 12 deletions(-) diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py index c28cbf9e631..48a41443612 100644 --- a/recipe/fully_async_policy/detach_utils.py +++ b/recipe/fully_async_policy/detach_utils.py @@ -13,7 +13,8 @@ # limitations under the License. import time from dataclasses import dataclass -from typing import Any, Optional +from typing import Any, Optional, Dict, List +from collections import defaultdict import numpy as np import torch @@ -229,3 +230,216 @@ def assemble_batch_from_rollout_samples( print(f"[BatchUtils] Batch assembly completed in {time.time() - start_time:.2f}s") return final_batch + +class MetricsAggregator: + """Metrics aggregator, used to combine metrics from multiple training steps""" + + def __init__(self): + # Store all values ​​for each metric + self.metric_values: Dict[str, List[float]] = defaultdict(list) + # Store the number of samples at each step for weighted averaging + self.sample_counts: List[int] = [] + # Store the timestamp of each step for time-related calculations + self.timestamps: List[float] = [] + # Step Count + self.step_count = 0 + + # Metric aggregation rule configuration + self.aggregation_rules = self._init_aggregation_rules() + + def _init_aggregation_rules(self) -> Dict[str, Dict[str, List[str]]]: + """Initialize metrics aggregation rules""" + return { + # # Cumulative metrics - take the last value + # 'last': [ + # 'fully_async/stale_samples_processed', + # 'fully_async/current_param_version', + # 'global_steps', + # 'epoch', + # ], + + # # Weighted average metrics - weighted by sample size + # 
'weighted_avg': [ + # 'fully_async/stale_samples_ratio', + # 'policy_loss', + # 'value_loss', + # 'entropy_loss', + # 'kl_divergence', + # 'advantage_mean', + # 'advantage_std', + # 'learning_rate', + # ], + + # # Summation type metrics - direct accumulation + # 'sum': [ + # 'fully_async/total_wait_time', + # 'processed_samples', + # 'total_tokens', + # ], + + # Average metrics - Simple Average + # 'avg': [ + # 'perf/throughput', + # 'fully_async/avg_processing_time', + # 'fully_async/tp50_processing_time', + # 'fully_async/tp95_processing_time', + # 'fully_async/tp99_processing_time', + # 'grad_norm', + # ], + + # # Maximum value metrics + # 'max': [ + # 'fully_async/max_processing_time', + # 'max_grad_norm', + # 'peak_memory_usage', + # ], + + # # Minimum value metrics + # 'min': [ + # 'fully_async/min_processing_time', + # 'min_learning_rate', + # ], + + # Time-Based metrics - Special Treatment + 'time_sum': [ + 'timing_s/adv', + 'timing_s/gen', + 'timing_s/old_log_prob', + 'timing_s/reward', + 'timing_s/step', + 'timing_s/update_actor', + ], + } + + def add_step_metrics(self, metrics: Dict[str, Any], sample_count: int, timestamp: float = None): + """Adding a single-step metrics""" + if timestamp is None: + timestamp = time.time() + + self.sample_counts.append(sample_count) + self.timestamps.append(timestamp) + self.step_count += 1 + + # Store all metrics values + for key, value in metrics.items(): + if isinstance(value, (int, float, np.number)): + self.metric_values[key].append(float(value)) + elif isinstance(value, torch.Tensor): + self.metric_values[key].append(float(value.item())) + + def _get_aggregation_type(self, metric_name: str) -> str: + """Determine the aggregation type based on the metric name""" + for agg_type, metric_list in self.aggregation_rules.items(): + if metric_name in metric_list: + return agg_type + import warnings + warnings.warn(f"No aggregation rule is matched in init_aggregation_rules. 
\ + For metric {metric_name}, the 'last' method is used") + return 'last' + + # raise ValueError(f"No aggregation rule is matched in init_aggregation_rules. \ + # Metric name: {metric_name}") # TODO: 删除 + + + # Aggregation rules based on naming patterns + if metric_name.startswith('time/'): + aggregation_type = 'time_sum' + elif metric_name.endswith('_ratio') or metric_name.endswith('_rate'): + aggregation_type = 'weighted_avg' + elif metric_name.endswith('_count') or metric_name.endswith('_total'): + aggregation_type = 'sum' + elif metric_name.startswith('max_') or metric_name.endswith('_max'): + aggregation_type = 'max' + elif metric_name.startswith('min_') or metric_name.endswith('_min'): + aggregation_type = 'min' + else: + # The default is weighted average. + aggregation_type = 'weighted_avg' + import warnings + warnings.simplefilter("always", DeprecationWarning) + warnings.warn("No aggregation rule is matched in init_aggregation_rules. \ + Aggregation rule is matched based on name prefix:", aggregation_type) + return aggregation_type + + def _aggregate_single_metric(self, metric_name: str, values: List[float]) -> float: + """Aggregating a single metric""" + if not values: + return 0.0 + + agg_type = self._get_aggregation_type(metric_name) + + if agg_type == 'last': + return values[-1] + + elif agg_type == 'weighted_avg': + # Weighted average + if len(values) != len(self.sample_counts): + # If the lengths do not match, use a simple average + return sum(values) / len(values) + + total_samples = sum(self.sample_counts) + if total_samples == 0: + return sum(values) / len(values) + + weighted_sum = sum(v * c for v, c in zip(values, self.sample_counts)) + return weighted_sum / total_samples + + elif agg_type == 'sum' or agg_type == 'time_sum': + return sum(values) + + elif agg_type == 'avg': + return sum(values) / len(values) + + elif agg_type == 'max': + return max(values) + + elif agg_type == 'min': + return min(values) + + else: + # Default average + return 
sum(values) / len(values) + + def get_aggregated_metrics(self) -> Dict[str, Any]: + """aggregated metrics""" + if self.step_count == 0: + return {} + + aggregated = {} + + # Aggregate all metrics + for metric_name, values in self.metric_values.items(): + aggregated[metric_name] = self._aggregate_single_metric(metric_name, values) + + # # Adding aggregate statistics + # aggregated.update({ + # 'aggregation/step_count': self.step_count, + # 'aggregation/total_samples': sum(self.sample_counts), + # 'aggregation/avg_samples_per_step': sum(self.sample_counts) / self.step_count, + # 'aggregation/time_span': self.timestamps[-1] - self.timestamps[0] if len(self.timestamps) > 1 else 0, + # }) + + # # Add statistics on sample size + # if self.sample_counts: + # aggregated.update({ + # 'aggregation/min_samples_per_step': min(self.sample_counts), + # 'aggregation/max_samples_per_step': max(self.sample_counts), + # }) + + return aggregated + + def reset(self): + """Reset Aggregator""" + self.metric_values.clear() + self.sample_counts.clear() + self.timestamps.clear() + self.step_count = 0 + + def get_current_stats(self) -> Dict[str, Any]: + """Get statistics about the current aggregation state (for debugging)""" + return { + 'step_count': self.step_count, + 'metric_count': len(self.metric_values), + 'total_samples': sum(self.sample_counts), + 'metric_names': list(self.metric_values.keys()), + } diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 0bc871b6d7f..bb7fcfa1889 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -210,7 +210,8 @@ async def update_param_version(self, version: int, validate: bool = False, globa val_metrics: dict = self._validate() data = ValidateMetrics(timing_raw=timing_raw, metrics=val_metrics, - global_steps=global_steps) + global_steps=global_steps, + param_version=version) await 
self.message_queue_client.put_validate(ray.cloudpickle.dumps(data)) def _validate_config(self): diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index cca95efab66..4d800af70a9 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -24,6 +24,7 @@ from recipe.fully_async_policy.detach_utils import ( ValidateMetrics, assemble_batch_from_rollout_samples, + MetricsAggregator, ) from recipe.fully_async_policy.message_queue import MessageQueueClient from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup @@ -120,6 +121,7 @@ def __init__( self.required_samples = int( self.minimal_bsz * config.actor_rollout_ref.actor.ppo_mini_batch_size / config.actor_rollout_ref.rollout.n ) + self.metrics_aggregator = MetricsAggregator() def set_message_queue_client(self, message_queue_client: MessageQueueClient): """Set message queue client""" @@ -245,7 +247,7 @@ def fit(self): from verl.utils.tracking import Tracking - logger = Tracking( + self.logger = Tracking( project_name=self.config.trainer.project_name, experiment_name=self.config.trainer.experiment_name, default_backend=self.config.trainer.logger, @@ -255,11 +257,13 @@ def fit(self): self.max_steps_duration = 0 # get validate data before training + if self.config.trainer.val_before_train and self.reward_fn is not None: + ray.get(self.param_synchronizer.wait_last_sync.remote()) val_data = self.message_queue_client.get_validate_sync() if val_data: val_data: ValidateMetrics = ray.cloudpickle.loads(val_data) - logger.log(data=val_data.metrics, step=val_data.global_steps) - logger.log(data=val_data.timing_raw, step=val_data.global_steps) + self.logger.log(data=val_data.metrics, step=val_data.param_version) + self.logger.log(data=val_data.timing_raw, step=val_data.param_version) pprint(f"[FullyAsyncTrainer] Initial validation metrics: {val_data.metrics}") # Use queue mode, no need for 
traditional dataloader iterator @@ -296,7 +300,11 @@ def fit(self): self._check_save_checkpoint(False, timing_raw) self._collect_metrics(batch, 0, metrics, timing_raw) - logger.log(data=metrics, step=self.global_steps) + self.metrics_aggregator.add_step_metrics( + metrics=metrics, + sample_count=self.required_samples, + timestamp=time.time() + ) # Trigger parameter synchronization after training step time_str = datetime.now().strftime("%H:%M:%S.%f")[:-3] print( @@ -309,8 +317,10 @@ def fit(self): val_data = self.message_queue_client.get_validate_sync() if val_data: val_data: ValidateMetrics = ray.cloudpickle.loads(val_data) - logger.log(data=val_data.metrics, step=val_data.global_steps) - logger.log(data=val_data.timing_raw, step=val_data.global_steps) + self.logger.log(data=val_data.metrics, step=val_data.param_version) + self.logger.log(data=val_data.timing_raw, step=val_data.param_version) + pprint(f"[FullyAsyncTrainer] parameter version: {val_data.param_version} \ + Validation metrics: {val_data.metrics}") self.global_steps += 1 # final parameter sync and validate @@ -319,8 +329,8 @@ def fit(self): val_data = self.message_queue_client.get_validate_sync() if val_data: val_data: ValidateMetrics = ray.cloudpickle.loads(val_data) - logger.log(data=val_data.metrics, step=val_data.global_steps) - logger.log(data=val_data.timing_raw, step=val_data.global_steps) + self.logger.log(data=val_data.metrics, step=val_data.param_version) + self.logger.log(data=val_data.timing_raw, step=val_data.param_version) pprint(f"[FullyAsyncTrainer] Final validation metrics: {val_data.metrics}") self._check_save_checkpoint(True, timing_raw) # TODO: 检查checkpoint @@ -339,7 +349,13 @@ def _trigger_parameter_sync_after_step(self, validate: bool = False, global_step self.current_param_version += 1 self.local_trigger_step = 1 + self.logger.log( + data=self.metrics_aggregator.get_aggregated_metrics(), + step=self.current_param_version, + ) + self.metrics_aggregator.reset() 
ray.get(self.param_synchronizer.wait_last_sync.remote()) ray.get(self.param_synchronizer.sync_weights.remote(self.current_param_version, validate=validate, - global_steps=global_steps)) \ No newline at end of file + global_steps=global_steps) + ) \ No newline at end of file diff --git a/recipe/fully_async_policy/runtime_env.yaml b/recipe/fully_async_policy/runtime_env.yaml index 81c7c9f4265..dcca08e67f7 100644 --- a/recipe/fully_async_policy/runtime_env.yaml +++ b/recipe/fully_async_policy/runtime_env.yaml @@ -1,2 +1,5 @@ env_vars: - VLLM_USE_V1: "1" \ No newline at end of file + VLLM_USE_V1: "1" + TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard/dapo_7b_math_fsdp2_4_12" + NCCL_DEBUG: "INFO" + HYDRA_FULL_ERROR: "1" \ No newline at end of file From 09b0e135d35454ac7a126b225a8c0adcccedc484 Mon Sep 17 00:00:00 2001 From: wangshulin02 <953550366@qq.com> Date: Wed, 3 Sep 2025 11:06:13 +0800 Subject: [PATCH 095/182] Merge branch 'recipe/async_policy' into recipe/fully_async_fix_0 --- .../dapo_7b_math_fsdp2_4_12.sh | 171 ++++++++++++++++++ .../dapo_7b_math_fsdp2_8_8.sh | 3 +- .../one_step_off_policy/megatron_workers.py | 2 +- 3 files changed, 174 insertions(+), 2 deletions(-) create mode 100644 recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh b/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh new file mode 100644 index 00000000000..2b4bf9c31fe --- /dev/null +++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh @@ -0,0 +1,171 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +project_name='DAPO' +exp_name='DAPO-Qwen2.5-7b-MATH-0527a1-fsdp2-fully-async-8-8' + +# Ray +# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} +# WORKING_DIR=${WORKING_DIR:-"${PWD}"} +# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} +# Paths +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +# very important! 
please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface +MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"} +CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} +TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} +TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} + +MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B +CKPTS_DIR=./ckpts/${project_name}/${exp_name} +TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet +TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet + +rollout_mode="async" +rollout_name="vllm" # sglang or vllm +if [ "$rollout_mode" = "async" ]; then + export VLLM_USE_V1=1 + return_raw_chat="True" +fi + +# Algorithm parameters +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +# Response length parameters +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 * 8)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 + +# Training parameters +loss_agg_mode="token-mean" + +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + +# Performance Related Parameter +use_dynamic_bsz=True +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) +ref_offload=True +actor_offload=False +gen_tp=1 +sp_size=1 +fsdp_size=2 + +# Fully async specific parameters +NNODES=${NNODES:-2} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} + +n_gpus_rollout=2 +n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout)) + +train_prompt_bsz=0 +gen_prompt_bsz=1 +n_resp_per_prompt=16 +train_prompt_mini_bsz=64 
+total_rollout_steps=$(((512*100))) +test_freq=10 +staleness_threshold=1 +trigger_parameter_sync_step=16 +partial_rollout=True + +PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python" +if [ ! -x "$PYTHON_INTERPRETER" ]; then + PYTHON_INTERPRETER="python3" +fi + +$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + data.gen_batch_size=${gen_prompt_bsz} \ + data.return_raw_chat=${return_raw_chat} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.strategy=fsdp2 \ + critic.strategy=fsdp2 \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.hybrid_engine=False \ + +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ + actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + 
actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.grad_clip=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \ + actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ + actor_rollout_ref.rollout.name=${rollout_name} \ + actor_rollout_ref.rollout.mode=${rollout_mode} \ + actor_rollout_ref.rollout.calculate_log_probs=True \ + reward_model.reward_manager=dapo \ + +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + 
+reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger=['console','tensorboard'] \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.val_before_train=True \ + trainer.test_freq="${test_freq}" \ + trainer.save_freq=-1 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.resume_mode=auto \ + trainer.nnodes="${NNODES}" \ + trainer.n_gpus_per_node="${n_gpus_training}" \ + rollout.nnodes="${NNODES}" \ + rollout.n_gpus_per_node="${n_gpus_rollout}" \ + rollout.total_rollout_steps="${total_rollout_steps}" \ + rollout.total_epochs=10 \ + async_training.staleness_threshold="${staleness_threshold}" \ + async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ + async_training.partial_rollout="${partial_rollout}" diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh b/recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh index c65080ba548..688a87fab92 100644 --- a/recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh +++ b/recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh @@ -75,7 +75,7 @@ n_gpus_training=8 train_prompt_bsz=0 gen_prompt_bsz=1 n_resp_per_prompt=16 -train_prompt_mini_bsz=4 +train_prompt_mini_bsz=64 total_rollout_steps=$(((512*100))) test_freq=10 staleness_threshold=1 @@ -145,6 +145,7 @@ $PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \ actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ actor_rollout_ref.rollout.name=${rollout_name} \ actor_rollout_ref.rollout.mode=${rollout_mode} \ + actor_rollout_ref.rollout.calculate_log_probs=True \ reward_model.reward_manager=dapo \ +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ diff --git a/recipe/one_step_off_policy/megatron_workers.py b/recipe/one_step_off_policy/megatron_workers.py index 5b338c5be42..a9318b8f7b3 100644 --- a/recipe/one_step_off_policy/megatron_workers.py +++ 
b/recipe/one_step_off_policy/megatron_workers.py @@ -168,7 +168,7 @@ def init_model(self): ) log_gpu_memory_usage("After building vllm rollout", logger=logger) - from sharding_manager import DetachShardingManager + from .detach_sharding_manager import DetachShardingManager rollout_sharding_manager = DetachShardingManager( inference_engine=rollout.inference_engine, device_mesh=rollout_device_mesh From d393d5c0daf6e5a489d84127a75a6aeb73872e7d Mon Sep 17 00:00:00 2001 From: wangshulin02 <953550366@qq.com> Date: Wed, 3 Sep 2025 14:10:03 +0800 Subject: [PATCH 096/182] fix final param_sync wait --- recipe/fully_async_policy/fully_async_trainer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 4d800af70a9..9276d148b66 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -326,6 +326,7 @@ def fit(self): # final parameter sync and validate if val_data is None: self._trigger_parameter_sync_after_step(validate=True, global_steps=self.global_steps-1) + ray.get(self.param_synchronizer.wait_last_sync.remote()) val_data = self.message_queue_client.get_validate_sync() if val_data: val_data: ValidateMetrics = ray.cloudpickle.loads(val_data) From 362c3f95d280922cc4459ecf8941c54e3eb8a5fa Mon Sep 17 00:00:00 2001 From: wangshulin02 <953550366@qq.com> Date: Wed, 3 Sep 2025 15:19:19 +0800 Subject: [PATCH 097/182] free kv cache by calling sleep&wake_up --- recipe/fully_async_policy/fully_async_rollouter.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index bb7fcfa1889..f3a25c2c30c 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -593,6 +593,8 @@ async def pause(self): await asyncio.gather(*self.active_tasks, return_exceptions=True) 
self.active_tasks.clear() print("[FullyAsyncRollouter][Public][Pause] All active tasks completed") + self.async_rollout_manager.sleep() + self.async_rollout_manager.wake_up() self.monitor_loop_trigger = False async def resume(self): From 53bfad2c7a995169e7532e3270dd148615dac464 Mon Sep 17 00:00:00 2001 From: hadoop-ai-search Date: Fri, 5 Sep 2025 15:39:15 +0800 Subject: [PATCH 098/182] reset one step --- .../detach_sharding_manager.py | 0 recipe/fully_async_policy/fsdp_workers.py | 268 ++++++++++++ recipe/fully_async_policy/fully_async_main.py | 4 +- recipe/fully_async_policy/megatron_workers.py | 200 +++++++++ recipe/one_step_off_policy/fsdp_workers.py | 84 +--- recipe/one_step_off_policy/main_ppo.py | 91 ++-- .../one_step_off_policy/megatron_workers.py | 89 ++-- recipe/one_step_off_policy/ray_trainer.py | 387 +++++++++++++++--- .../vllm_sharding_manager.py | 74 ++++ tests/special_e2e/run_fully_async_policy.sh | 2 +- 10 files changed, 1019 insertions(+), 180 deletions(-) rename recipe/{one_step_off_policy => fully_async_policy}/detach_sharding_manager.py (100%) create mode 100644 recipe/fully_async_policy/fsdp_workers.py create mode 100644 recipe/fully_async_policy/megatron_workers.py create mode 100644 recipe/one_step_off_policy/vllm_sharding_manager.py diff --git a/recipe/one_step_off_policy/detach_sharding_manager.py b/recipe/fully_async_policy/detach_sharding_manager.py similarity index 100% rename from recipe/one_step_off_policy/detach_sharding_manager.py rename to recipe/fully_async_policy/detach_sharding_manager.py diff --git a/recipe/fully_async_policy/fsdp_workers.py b/recipe/fully_async_policy/fsdp_workers.py new file mode 100644 index 00000000000..086f109e434 --- /dev/null +++ b/recipe/fully_async_policy/fsdp_workers.py @@ -0,0 +1,268 @@ +# Copyright 2025 Bytedance Ltd. and/or its affiliates +# Copyright 2025 Meituan Ltd. 
and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os + +import torch +import torch.distributed +from omegaconf import DictConfig, OmegaConf +from torch.distributed.device_mesh import init_device_mesh +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from transformers import AutoConfig + +from verl.single_controller.base import Worker +from verl.single_controller.base.decorator import Dispatch, register +from verl.utils import hf_processor, hf_tokenizer, omega_conf_to_dataclass +from verl.utils.debug import DistProfiler, DistProfilerExtension, log_gpu_memory_usage +from verl.utils.device import ( + get_device_name, + get_nccl_backend, + get_torch_device, +) +from verl.utils.fs import copy_to_local +from verl.utils.fsdp_utils import ( + fsdp_version, +) +from verl.utils.import_utils import import_external_libs +from verl.utils.model import get_generation_config, update_model_config +from verl.utils.vllm_utils import patch_vllm_moe_model_weight_loader +from verl.workers.fsdp_workers import ActorRolloutRefWorker, AsyncActorRolloutRefWorker, CriticWorker + +logger = logging.getLogger(__file__) +logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) + +device_name = get_device_name() + +__all__ = ["DetachActorWorker", "DetachRolloutWorker", "DetachAsyncRolloutWorker", "CriticWorker"] + + +def get_inference_model(rollout): + """ + 根据不同类型的inference_engine获取模型对象 + Args: + rollout: rollout对象,包含inference_engine + 
Returns: + model: 模型对象 + """ + inference_engine = rollout.inference_engine + # 判断inference_engine的类型 + if hasattr(inference_engine, "llm_engine"): + # LLM类型 - vLLMRollout + inference_model = inference_engine.llm_engine.model_executor.driver_worker.worker.model_runner.model + elif hasattr(inference_engine, "worker"): + # WorkerWrapperBase类型 - vLLMAsyncRollout + inference_model = inference_engine.worker.model_runner.model + else: + raise AttributeError( + f"Unsupported inference_engine type: {type(inference_engine)}. " + f"Expected LLM (with llm_engine attribute) or WorkerWrapperBase (with worker attribute)." + ) + return inference_model + + +class DetachNcclSync(ActorRolloutRefWorker): + def _get_actor_params(self): + pass + + @register(dispatch_mode=Dispatch.ONE_TO_ALL, blocking=False) + def sync_rollout_weights(self): + assert (self._is_actor or self._is_rollout) and not self.config.hybrid_engine + assert hasattr(self, "_weights_info") and self._weights_info is not None + + params = self._get_actor_params() if self._is_actor else None + if self._is_rollout: + inference_model = get_inference_model(self.rollout) + patch_vllm_moe_model_weight_loader(inference_model) + for key, shape, dtype in self._weights_info: + tensor = torch.empty(shape, dtype=dtype, device=get_torch_device().current_device()) + if self._is_actor: + assert key in params + origin_data = params[key] + if hasattr(origin_data, "full_tensor"): + origin_data = origin_data.full_tensor() + if torch.distributed.get_rank() == 0: + tensor.copy_(origin_data) + from ray.util.collective import collective + + collective.broadcast(tensor, src_rank=0, group_name="actor_rollout") + if self._is_rollout: + inference_model.load_weights([(key, tensor)]) + + @register(dispatch_mode=Dispatch.ONE_TO_ALL) + def get_actor_weights_info(self): + assert self._is_actor + if hasattr(self, "_weights_info"): + return self._weights_info + if fsdp_version(self.actor_module_fsdp) == 1: + from torch.distributed.fsdp.api import 
ShardedStateDictConfig, StateDictType + + FSDP.set_state_dict_type( + self.actor_module_fsdp, + state_dict_type=StateDictType.SHARDED_STATE_DICT, + state_dict_config=ShardedStateDictConfig(), + ) + params = self._get_actor_params() + ret = [] + for key, tensor in params.items(): + ret.append((key, tensor.size(), tensor.dtype)) + self._weights_info = ret + return ret + + +class DetachActorWorker(DetachNcclSync): + def _get_actor_params(self): + assert self._is_actor + params = self.actor_module_fsdp.state_dict() + from verl.utils.model import convert_weight_keys + + params = convert_weight_keys( + params, getattr(self.actor_module_fsdp, "_fsdp_wrapped_module", self.actor_module_fsdp) + ) + return params + + +class DetachRolloutWorker(DetachNcclSync): + def __init__(self, config: DictConfig, role: str): + Worker.__init__(self) + assert role == "rollout" + self.config = config + import torch.distributed + + if not torch.distributed.is_initialized(): + rank = int(os.environ.get("RANK", 0)) + world_size = int(os.environ.get("WORLD_SIZE", 1)) + torch.distributed.init_process_group( + backend=f"cpu:gloo,{get_device_name()}:{get_nccl_backend()}", + rank=rank, + world_size=world_size, + init_method=os.environ.get("DIST_INIT_METHOD", None), + ) + # TODO(haibin.lin): + # As of now the type of config is DictConfig, if we assign config.profiler with ProfilerConfig, + # it will actually convert the ProfilerConfig dataclass back to a DictConfig. 
+ # We can still use ProfilerConfig for testing purpose (tests/utils/test_nvtx_profile.py) + # as they provides DictConfig-like interface + # The benefit of creating the dataclass config is to perform validation during __post_init__ + profiler_config = omega_conf_to_dataclass(config.rollout.get("profiler", {})) + DistProfilerExtension.__init__(self, DistProfiler(rank=self.rank, config=profiler_config)) + self._is_rollout = True + self._is_actor = False + + @register(dispatch_mode=Dispatch.ONE_TO_ALL) + def init_model(self): + # This is used to import external_lib into the huggingface systems + import_external_libs(self.config.model.get("external_lib", None)) + override_model_config = OmegaConf.to_container(OmegaConf.create(self.config.model.get("override_config", {}))) + + use_shm = self.config.model.get("use_shm", False) + local_path = copy_to_local(self.config.model.path, use_shm=use_shm) + trust_remote_code = self.config.model.get("trust_remote_code", False) + + self.tokenizer = hf_tokenizer(local_path, trust_remote_code=trust_remote_code) + self.processor = hf_processor(local_path, trust_remote_code=trust_remote_code) + + if self.config.model.get("custom_chat_template", None) is not None: + if self.processor is not None: + self.processor.chat_template = self.config.model.custom_chat_template + else: + self.tokenizer.chat_template = self.config.model.custom_chat_template + + # override model kwargs + actor_model_config = AutoConfig.from_pretrained( + local_path, trust_remote_code=trust_remote_code, attn_implementation="flash_attention_2" + ) + + # patch for kimi-vl + if getattr(actor_model_config, "model_type", None) == "kimi_vl": + actor_model_config.text_config.topk_method = "greedy" + + self.generation_config = get_generation_config(local_path, trust_remote_code=trust_remote_code) + + override_config_kwargs = { + "bos_token_id": self.tokenizer.bos_token_id, + "eos_token_id": self.tokenizer.eos_token_id, + "pad_token_id": self.tokenizer.pad_token_id, + } + 
override_config_kwargs.update(override_model_config) + update_model_config(actor_model_config, override_config_kwargs=override_config_kwargs) + if self.rank == 0: + print(f"Model config after override: {actor_model_config}") + + infer_tp = self.config.rollout.tensor_model_parallel_size + dp = self.world_size // infer_tp + assert self.world_size % infer_tp == 0, ( + f"rollout world_size: {self.world_size} is not divisible by infer_tp: {infer_tp}" + ) + rollout_device_mesh = init_device_mesh( + device_name, mesh_shape=(dp, infer_tp), mesh_dim_names=["dp", "infer_tp"] + ) + rollout_name = self.config.rollout.name + assert rollout_name == "vllm" + + from verl.workers.rollout.vllm_rollout import vLLMRollout + + log_gpu_memory_usage(f"Before building {rollout_name} rollout", logger=logger) + + from verl.workers.rollout.vllm_rollout import vLLMAsyncRollout + + vllm_rollout_cls = vLLMRollout if self.config.rollout.mode == "sync" else vLLMAsyncRollout + rollout = vllm_rollout_cls( + model_path=local_path, + config=self.config.rollout, + tokenizer=self.tokenizer, + model_hf_config=actor_model_config, + device_mesh=rollout_device_mesh, + trust_remote_code=trust_remote_code, + ) + log_gpu_memory_usage(f"After building {rollout_name} rollout", logger=logger) + + from .detach_sharding_manager import DetachShardingManager + + sharding_manager = DetachShardingManager( + inference_engine=rollout.inference_engine, device_mesh=rollout_device_mesh + ) + + log_gpu_memory_usage("After building sharding manager", logger=logger) + + self.rollout = rollout + self.rollout_sharding_manager = sharding_manager + + @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO, blocking=False) + def async_generate_sequences(self, *args, **kwargs): + return super().generate_sequences(*args, **kwargs) + + @register(dispatch_mode=Dispatch.ONE_TO_ALL) + def set_actor_weights_info(self, weights_info): + assert self._is_rollout + self._weights_info = weights_info + + +class 
DetachAsyncRolloutWorker(AsyncActorRolloutRefWorker, DetachRolloutWorker): + def __init__(self, config: DictConfig, role: str): + print(f"[DetachAsyncRolloutWorker] {DetachAsyncRolloutWorker.__mro__}") + DetachRolloutWorker.__init__(self, config, role) + + @register(dispatch_mode=Dispatch.ONE_TO_ALL) + def init_model(self): + print("[DetachAsyncRolloutWorker] init_model") + DetachRolloutWorker.init_model(self) + + self.vllm_tp_size = self.config.rollout.tensor_model_parallel_size + self.vllm_dp_rank = int(os.environ["RANK"]) // self.vllm_tp_size + self.vllm_tp_rank = int(os.environ["RANK"]) % self.vllm_tp_size + + # used for sleep/wake_up + self.rollout.sharding_manager = self.rollout_sharding_manager diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py index 09961c85391..78fc1784b82 100644 --- a/recipe/fully_async_policy/fully_async_main.py +++ b/recipe/fully_async_policy/fully_async_main.py @@ -81,7 +81,7 @@ def create_role_worker_mapping(config): # Select worker class based on strategy if config.actor_rollout_ref.actor.strategy == "fsdp2": assert config.actor_rollout_ref.actor.strategy == config.critic.strategy - from recipe.one_step_off_policy.fsdp_workers import ( + from recipe.fully_async_policy.fsdp_workers import ( CriticWorker, DetachActorWorker, DetachAsyncRolloutWorker, @@ -92,7 +92,7 @@ def create_role_worker_mapping(config): elif config.actor_rollout_ref.actor.strategy == "megatron": assert config.actor_rollout_ref.actor.strategy == config.critic.strategy - from recipe.one_step_off_policy.megatron_workers import ( + from recipe.fully_async_policy.megatron_workers import ( CriticWorker, DetachActorWorker, DetachAsyncRolloutWorker, diff --git a/recipe/fully_async_policy/megatron_workers.py b/recipe/fully_async_policy/megatron_workers.py new file mode 100644 index 00000000000..a9318b8f7b3 --- /dev/null +++ b/recipe/fully_async_policy/megatron_workers.py @@ -0,0 +1,200 @@ +# Copyright 2025 Bytedance 
Ltd. and/or its affiliates +# Copyright 2025 Meituan Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os + +import torch +import torch.distributed +from omegaconf import DictConfig, OmegaConf + +from verl.single_controller.base.decorator import Dispatch, register +from verl.utils.debug import ( + log_gpu_memory_usage, +) +from verl.utils.device import get_device_name, get_torch_device +from verl.utils.fs import copy_to_local +from verl.utils.vllm_utils import patch_vllm_moe_model_weight_loader +from verl.workers.megatron_workers import ( + ActorRolloutRefWorker, + AsyncActorRolloutRefWorker, + CriticWorker, +) + +logger = logging.getLogger(__file__) +logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) + +__all__ = ["DetachActorWorker", "DetachRolloutWorker", "DetachAsyncRolloutWorker", "CriticWorker"] + + +class DetachNcclSync(ActorRolloutRefWorker): + def _get_actor_params_generator(self): + pass + + @register(dispatch_mode=Dispatch.ONE_TO_ALL, blocking=False) + def sync_rollout_weights(self): + assert (self._is_actor or self._is_rollout) and not self.config.hybrid_engine + assert hasattr(self, "_weights_info") and self._weights_info is not None + + params_generator = self._get_actor_params_generator() if self._is_actor else None + if self._is_rollout: + inference_model = ( + self.rollout.inference_engine.llm_engine.model_executor.driver_worker.worker.model_runner.model + ) + 
patch_vllm_moe_model_weight_loader(inference_model) + for key, shape, dtype in self._weights_info: + if self._is_actor: + weight_key, weight = next(params_generator) + assert key == weight_key + assert shape == weight.size() + assert dtype == weight.dtype + + tensor = torch.empty(shape, dtype=dtype, device=get_torch_device().current_device()) + if self._is_actor and torch.distributed.get_rank() == 0: + tensor.copy_(weight) + from ray.util.collective import collective + + collective.broadcast(tensor, src_rank=0, group_name="actor_rollout") + if self._is_rollout: + inference_model.load_weights([(key, tensor)]) + + @register(dispatch_mode=Dispatch.ONE_TO_ALL) + def get_actor_weights_info(self): + assert self._is_actor + if hasattr(self, "_weights_info"): + return self._weights_info + + params_generator = self._get_actor_params_generator() + ret = [] + for key, tensor in params_generator: + ret.append((key, tensor.size(), tensor.dtype)) + + self._weights_info = ret + return ret + + +class DetachActorWorker(DetachNcclSync): + def _get_actor_params_generator(self): + assert self._is_actor + from verl.models.mcore import get_mcore_weight_converter + from verl.utils.megatron_utils import per_tensor_generator + + layer_name_mapping = { + "qkv_layer_name": "self_attention.linear_qkv.", + "gate_proj_layer_name": "linear_fc1.", + } + weight_converter = get_mcore_weight_converter(self.actor_model_config, self.dtype) + generator = per_tensor_generator( + self.actor.actor_module, + self.actor_model_config, + weight_converter, + self.tf_config, + layer_name_mapping, + ) + return generator + + +class DetachRolloutWorker(DetachNcclSync): + @register(dispatch_mode=Dispatch.ONE_TO_ALL) + def init_model(self): + if self.config.model.get("external_lib", None) is not None: + # This is used to import external_lib into the huggingface systems + import importlib + + importlib.import_module(self.config.model.external_lib) + + from verl.utils.torch_dtypes import PrecisionType + + 
override_model_config = OmegaConf.to_container(OmegaConf.create(self.config.model.get("override_config", {}))) + override_transformer_config = {} + self.param_dtype = torch.bfloat16 + self.dtype = PrecisionType.to_dtype(self.param_dtype) + trust_remote_code = self.config.model.get("trust_remote_code", False) + + from verl.utils.model import get_generation_config + + self._init_hf_config_and_tf_config( + self.config.model.path, + self.config.model.path, + self.dtype, + override_model_config, + override_transformer_config, + trust_remote_code, + ) + self.generation_config = get_generation_config(self.local_path) + + from torch.distributed.device_mesh import init_device_mesh + + assert self.config.rollout.name == "vllm" + + from verl.workers.rollout.vllm_rollout import vLLMRollout + + # NOTE(sgm): If the QKV and gate_up projection layer are concate together in actor, + # we will reorganize their weight format when resharding from actor to rollout. + + infer_tp = self.config.rollout.tensor_model_parallel_size + dp = self.world_size // infer_tp + assert self.world_size % infer_tp == 0, ( + f"rollout world_size: {self.world_size} is not divisible by infer_tp: {infer_tp}" + ) + rollout_device_mesh = init_device_mesh( + get_device_name(), mesh_shape=(dp, infer_tp), mesh_dim_names=["dp", "infer_tp"] + ) + log_gpu_memory_usage("Before building vllm rollout", logger=None) + + local_path = copy_to_local(self.config.model.path, use_shm=self.config.model.get("use_shm", False)) + from verl.workers.rollout.vllm_rollout import vLLMAsyncRollout + + vllm_rollout_cls = vLLMRollout if self.config.rollout.mode == "sync" else vLLMAsyncRollout + rollout = vllm_rollout_cls( + model_path=local_path, + config=self.config.rollout, + tokenizer=self.tokenizer, + model_hf_config=self.hf_config, + device_mesh=rollout_device_mesh, + trust_remote_code=trust_remote_code, + ) + log_gpu_memory_usage("After building vllm rollout", logger=logger) + + from .detach_sharding_manager import 
DetachShardingManager + + rollout_sharding_manager = DetachShardingManager( + inference_engine=rollout.inference_engine, device_mesh=rollout_device_mesh + ) + + log_gpu_memory_usage("After building sharding manager", logger=logger) + + self.rollout = rollout + self.sharding_manager = rollout_sharding_manager + self.rollout.sharding_manager = rollout_sharding_manager + + @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO, blocking=False) + def async_generate_sequences(self, *args, **kwargs): + return super().generate_sequences(*args, **kwargs) + + @register(dispatch_mode=Dispatch.ONE_TO_ALL) + def set_actor_weights_info(self, weights_info): + assert self._is_rollout + self._weights_info = weights_info + + +class DetachAsyncRolloutWorker(AsyncActorRolloutRefWorker, DetachRolloutWorker): + def __init__(self, config: DictConfig, role: str): + print(DetachAsyncRolloutWorker.__mro__) + DetachRolloutWorker.__init__(self, config, role) + + @register(dispatch_mode=Dispatch.ONE_TO_ALL) + def init_model(self): + DetachRolloutWorker.init_model(self) diff --git a/recipe/one_step_off_policy/fsdp_workers.py b/recipe/one_step_off_policy/fsdp_workers.py index 086f109e434..0aa21991708 100644 --- a/recipe/one_step_off_policy/fsdp_workers.py +++ b/recipe/one_step_off_policy/fsdp_workers.py @@ -39,43 +39,27 @@ from verl.utils.import_utils import import_external_libs from verl.utils.model import get_generation_config, update_model_config from verl.utils.vllm_utils import patch_vllm_moe_model_weight_loader -from verl.workers.fsdp_workers import ActorRolloutRefWorker, AsyncActorRolloutRefWorker, CriticWorker +from verl.workers.fsdp_workers import ActorRolloutRefWorker as ARRWorker +from verl.workers.fsdp_workers import CriticWorker logger = logging.getLogger(__file__) logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) device_name = get_device_name() -__all__ = ["DetachActorWorker", "DetachRolloutWorker", "DetachAsyncRolloutWorker", "CriticWorker"] - - -def 
get_inference_model(rollout): - """ - 根据不同类型的inference_engine获取模型对象 - Args: - rollout: rollout对象,包含inference_engine - Returns: - model: 模型对象 - """ - inference_engine = rollout.inference_engine - # 判断inference_engine的类型 - if hasattr(inference_engine, "llm_engine"): - # LLM类型 - vLLMRollout - inference_model = inference_engine.llm_engine.model_executor.driver_worker.worker.model_runner.model - elif hasattr(inference_engine, "worker"): - # WorkerWrapperBase类型 - vLLMAsyncRollout - inference_model = inference_engine.worker.model_runner.model - else: - raise AttributeError( - f"Unsupported inference_engine type: {type(inference_engine)}. " - f"Expected LLM (with llm_engine attribute) or WorkerWrapperBase (with worker attribute)." - ) - return inference_model +__all__ = ["ActorRolloutRefWorker", "AsyncActorRolloutRefWorker", "CriticWorker", "RolloutWorker"] -class DetachNcclSync(ActorRolloutRefWorker): +class ActorRolloutRefWorker(ARRWorker): def _get_actor_params(self): - pass + assert self._is_actor + params = self.actor_module_fsdp.state_dict() + from verl.utils.model import convert_weight_keys + + params = convert_weight_keys( + params, getattr(self.actor_module_fsdp, "_fsdp_wrapped_module", self.actor_module_fsdp) + ) + return params @register(dispatch_mode=Dispatch.ONE_TO_ALL, blocking=False) def sync_rollout_weights(self): @@ -84,7 +68,9 @@ def sync_rollout_weights(self): params = self._get_actor_params() if self._is_actor else None if self._is_rollout: - inference_model = get_inference_model(self.rollout) + inference_model = ( + self.rollout.inference_engine.llm_engine.model_executor.driver_worker.worker.model_runner.model + ) patch_vllm_moe_model_weight_loader(inference_model) for key, shape, dtype in self._weights_info: tensor = torch.empty(shape, dtype=dtype, device=get_torch_device().current_device()) @@ -122,19 +108,7 @@ def get_actor_weights_info(self): return ret -class DetachActorWorker(DetachNcclSync): - def _get_actor_params(self): - assert self._is_actor 
- params = self.actor_module_fsdp.state_dict() - from verl.utils.model import convert_weight_keys - - params = convert_weight_keys( - params, getattr(self.actor_module_fsdp, "_fsdp_wrapped_module", self.actor_module_fsdp) - ) - return params - - -class DetachRolloutWorker(DetachNcclSync): +class RolloutWorker(ActorRolloutRefWorker): def __init__(self, config: DictConfig, role: str): Worker.__init__(self) assert role == "rollout" @@ -228,17 +202,16 @@ def init_model(self): trust_remote_code=trust_remote_code, ) log_gpu_memory_usage(f"After building {rollout_name} rollout", logger=logger) + from .vllm_sharding_manager import VLLMShardingManager - from .detach_sharding_manager import DetachShardingManager - - sharding_manager = DetachShardingManager( + rollout_sharding_manager = VLLMShardingManager( inference_engine=rollout.inference_engine, device_mesh=rollout_device_mesh ) log_gpu_memory_usage("After building sharding manager", logger=logger) self.rollout = rollout - self.rollout_sharding_manager = sharding_manager + self.rollout_sharding_manager = rollout_sharding_manager @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO, blocking=False) def async_generate_sequences(self, *args, **kwargs): @@ -250,19 +223,6 @@ def set_actor_weights_info(self, weights_info): self._weights_info = weights_info -class DetachAsyncRolloutWorker(AsyncActorRolloutRefWorker, DetachRolloutWorker): - def __init__(self, config: DictConfig, role: str): - print(f"[DetachAsyncRolloutWorker] {DetachAsyncRolloutWorker.__mro__}") - DetachRolloutWorker.__init__(self, config, role) - - @register(dispatch_mode=Dispatch.ONE_TO_ALL) - def init_model(self): - print("[DetachAsyncRolloutWorker] init_model") - DetachRolloutWorker.init_model(self) - - self.vllm_tp_size = self.config.rollout.tensor_model_parallel_size - self.vllm_dp_rank = int(os.environ["RANK"]) // self.vllm_tp_size - self.vllm_tp_rank = int(os.environ["RANK"]) % self.vllm_tp_size - - # used for sleep/wake_up - self.rollout.sharding_manager = 
self.rollout_sharding_manager +class AsyncActorRolloutRefWorker(ActorRolloutRefWorker): + def __init__(self, *args, **kwargs): + raise NotImplementedError diff --git a/recipe/one_step_off_policy/main_ppo.py b/recipe/one_step_off_policy/main_ppo.py index 0dcdbef3705..44a0f4b8675 100644 --- a/recipe/one_step_off_policy/main_ppo.py +++ b/recipe/one_step_off_policy/main_ppo.py @@ -23,18 +23,58 @@ import ray from omegaconf import OmegaConf +from verl.trainer.constants_ppo import get_ppo_ray_runtime_env from verl.trainer.main_ppo import create_rl_dataset, create_rl_sampler from verl.trainer.ppo.reward import load_reward_manager from .ray_trainer import OneStepOffRayTrainer +@hydra.main(config_path="config", config_name="one_step_off_ppo_trainer", version_base=None) +def main(config): + run_ppo(config) + + +# Define a function to run the PPO-like training process +def run_ppo(config) -> None: + # Check if Ray is not initialized + if not ray.is_initialized(): + # Initialize Ray with a local cluster configuration + # Set environment variables in the runtime environment to control tokenizer parallelism, + # NCCL debug level, VLLM logging level, and allow runtime LoRA updating + # `num_cpus` specifies the number of CPU cores Ray can use, obtained from the configuration + ray.init( + runtime_env=get_ppo_ray_runtime_env(), + num_cpus=config.ray_init.num_cpus, + ) + + # Create a remote instance of the TaskRunner class, and + # Execute the `run` method of the TaskRunner instance remotely and wait for it to complete + if ( + OmegaConf.select(config.trainer, "profile_steps") is not None + and len(OmegaConf.select(config.trainer, "profile_steps")) > 0 + ): + nsight_options = OmegaConf.to_container(config.trainer.controller_nsight_options) + runner = TaskRunner.options(runtime_env={"nsight": nsight_options}).remote() + else: + runner = TaskRunner.remote() + ray.get(runner.run.remote(config)) + + # [Optional] get the path of the timeline trace file from the configuration, default to 
None + # This file is used for performance analysis + timeline_json_file = config.ray_init.get("timeline_json_file", None) + if timeline_json_file: + ray.timeline(filename=timeline_json_file) + + @ray.remote(num_cpus=1) # please make sure main_task is not scheduled on head -class OneStepOffTaskRunner: +class TaskRunner: def run(self, config): # Print the initial configuration. `resolve=True` will evaluate symbolic values. from pprint import pprint + from omegaconf import OmegaConf + from verl.utils.fs import copy_to_local print(f"TaskRunner hostname: {socket.gethostname()}, PID: {os.getpid()}") @@ -60,26 +100,38 @@ def run(self, config): # Define worker classes based on the actor strategy. if config.actor_rollout_ref.actor.strategy == "fsdp2": assert config.actor_rollout_ref.actor.strategy == config.critic.strategy - from recipe.one_step_off_policy.fsdp_workers import ( + from verl.single_controller.ray import RayWorkerGroup + + from .fsdp_workers import ( + ActorRolloutRefWorker, + AsyncActorRolloutRefWorker, CriticWorker, - DetachActorWorker, - DetachAsyncRolloutWorker, - DetachRolloutWorker, + RolloutWorker, ) - from verl.single_controller.ray import RayWorkerGroup + actor_rollout_cls = ( + AsyncActorRolloutRefWorker + if config.actor_rollout_ref.rollout.mode == "async" + else ActorRolloutRefWorker + ) ray_worker_group_cls = RayWorkerGroup elif config.actor_rollout_ref.actor.strategy == "megatron": assert config.actor_rollout_ref.actor.strategy == config.critic.strategy - from recipe.one_step_off_policy.megatron_workers import ( + from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup + + from .megatron_workers import ( + ActorRolloutRefWorker, + AsyncActorRolloutRefWorker, CriticWorker, - DetachActorWorker, - DetachAsyncRolloutWorker, - DetachRolloutWorker, + RolloutWorker, ) - from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup + actor_rollout_cls = ( + AsyncActorRolloutRefWorker + if config.actor_rollout_ref.rollout.mode 
== "async" + else ActorRolloutRefWorker + ) ray_worker_group_cls = NVMegatronRayWorkerGroup else: @@ -88,10 +140,8 @@ def run(self, config): from .ray_trainer import ResourcePoolManager, Role role_worker_mapping = { - Role.Actor: ray.remote(DetachActorWorker), - Role.Rollout: ray.remote( - DetachAsyncRolloutWorker if config.actor_rollout_ref.rollout.mode == "async" else DetachRolloutWorker - ), + Role.Actor: ray.remote(actor_rollout_cls), + Role.Rollout: ray.remote(RolloutWorker), Role.Critic: ray.remote(CriticWorker), } @@ -122,7 +172,7 @@ def run(self, config): # finally, we combine all the rewards together # The reward type depends on the tag of the data if config.reward_model.enable: - if config.reward_model.strategy == "fsdp2": + if config.reward_model.strategy in ["fsdp2"]: from verl.workers.fsdp_workers import RewardModelWorker elif config.reward_model.strategy == "megatron": from verl.workers.megatron_workers import RewardModelWorker @@ -133,7 +183,7 @@ def run(self, config): # Add a reference policy worker if KL loss or KL reward is used. if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss: - role_worker_mapping[Role.RefPolicy] = ray.remote(DetachActorWorker) + role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker) mapping[Role.RefPolicy] = global_pool_id # Load the reward manager for training and validation. 
@@ -174,12 +224,5 @@ def run(self, config): trainer.fit() -@hydra.main(config_path="config", config_name="one_step_off_ppo_trainer", version_base=None) -def main(config): - from verl.trainer.main_ppo import run_ppo - - run_ppo(config, OneStepOffTaskRunner) - - if __name__ == "__main__": main() diff --git a/recipe/one_step_off_policy/megatron_workers.py b/recipe/one_step_off_policy/megatron_workers.py index a9318b8f7b3..f7b58405b4f 100644 --- a/recipe/one_step_off_policy/megatron_workers.py +++ b/recipe/one_step_off_policy/megatron_workers.py @@ -27,21 +27,42 @@ from verl.utils.device import get_device_name, get_torch_device from verl.utils.fs import copy_to_local from verl.utils.vllm_utils import patch_vllm_moe_model_weight_loader -from verl.workers.megatron_workers import ( - ActorRolloutRefWorker, - AsyncActorRolloutRefWorker, - CriticWorker, -) +from verl.workers.megatron_workers import ActorRolloutRefWorker as ARRWorker +from verl.workers.megatron_workers import CriticWorker, RewardModelWorker logger = logging.getLogger(__file__) logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) -__all__ = ["DetachActorWorker", "DetachRolloutWorker", "DetachAsyncRolloutWorker", "CriticWorker"] +__all__ = ["ActorRolloutRefWorker", "AsyncActorRolloutRefWorker", "CriticWorker", "RewardModelWorker", "RolloutWorker"] + +class ActorRolloutRefWorker(ARRWorker): + def __init__(self, config: DictConfig, role: str): + assert role in ["actor", "ref"] + tmp_role = "ref" if role == "ref" else "actor_rollout" + super().__init__(config, tmp_role) + if role == "actor": + self._is_rollout = False + self.role = role -class DetachNcclSync(ActorRolloutRefWorker): def _get_actor_params_generator(self): - pass + assert self._is_actor + from verl.models.mcore import get_mcore_weight_converter + from verl.utils.megatron_utils import per_tensor_generator + + layer_name_mapping = { + "qkv_layer_name": "self_attention.linear_qkv.", + "gate_proj_layer_name": "linear_fc1.", + } + weight_converter = 
get_mcore_weight_converter(self.actor_model_config, self.dtype) + generator = per_tensor_generator( + self.actor.actor_module, + self.actor_model_config, + weight_converter, + self.tf_config, + layer_name_mapping, + ) + return generator @register(dispatch_mode=Dispatch.ONE_TO_ALL, blocking=False) def sync_rollout_weights(self): @@ -85,28 +106,11 @@ def get_actor_weights_info(self): return ret -class DetachActorWorker(DetachNcclSync): - def _get_actor_params_generator(self): - assert self._is_actor - from verl.models.mcore import get_mcore_weight_converter - from verl.utils.megatron_utils import per_tensor_generator - - layer_name_mapping = { - "qkv_layer_name": "self_attention.linear_qkv.", - "gate_proj_layer_name": "linear_fc1.", - } - weight_converter = get_mcore_weight_converter(self.actor_model_config, self.dtype) - generator = per_tensor_generator( - self.actor.actor_module, - self.actor_model_config, - weight_converter, - self.tf_config, - layer_name_mapping, - ) - return generator - +class RolloutWorker(ActorRolloutRefWorker): + def __init__(self, config: DictConfig, role: str): + assert role == "rollout" + ARRWorker.__init__(self, config, role) -class DetachRolloutWorker(DetachNcclSync): @register(dispatch_mode=Dispatch.ONE_TO_ALL) def init_model(self): if self.config.model.get("external_lib", None) is not None: @@ -138,9 +142,12 @@ def init_model(self): from torch.distributed.device_mesh import init_device_mesh assert self.config.rollout.name == "vllm" + assert self.config.rollout.mode == "sync" from verl.workers.rollout.vllm_rollout import vLLMRollout + from .vllm_sharding_manager import VLLMShardingManager + # NOTE(sgm): If the QKV and gate_up projection layer are concate together in actor, # we will reorganize their weight format when resharding from actor to rollout. 
@@ -168,17 +175,14 @@ def init_model(self): ) log_gpu_memory_usage("After building vllm rollout", logger=logger) - from .detach_sharding_manager import DetachShardingManager - - rollout_sharding_manager = DetachShardingManager( - inference_engine=rollout.inference_engine, device_mesh=rollout_device_mesh + sharding_manager = VLLMShardingManager( + inference_engine=rollout.inference_engine, + device_mesh=rollout_device_mesh, ) - log_gpu_memory_usage("After building sharding manager", logger=logger) - self.rollout = rollout - self.sharding_manager = rollout_sharding_manager - self.rollout.sharding_manager = rollout_sharding_manager + self.rollout, self.sharding_manager = rollout, sharding_manager + self.rollout.sharding_manager = sharding_manager @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO, blocking=False) def async_generate_sequences(self, *args, **kwargs): @@ -190,11 +194,6 @@ def set_actor_weights_info(self, weights_info): self._weights_info = weights_info -class DetachAsyncRolloutWorker(AsyncActorRolloutRefWorker, DetachRolloutWorker): - def __init__(self, config: DictConfig, role: str): - print(DetachAsyncRolloutWorker.__mro__) - DetachRolloutWorker.__init__(self, config, role) - - @register(dispatch_mode=Dispatch.ONE_TO_ALL) - def init_model(self): - DetachRolloutWorker.init_model(self) +class AsyncActorRolloutRefWorker(ActorRolloutRefWorker): + def __init__(self, *args, **kwargs): + raise NotImplementedError diff --git a/recipe/one_step_off_policy/ray_trainer.py b/recipe/one_step_off_policy/ray_trainer.py index ef8d6d8792e..1f7011bdf54 100644 --- a/recipe/one_step_off_policy/ray_trainer.py +++ b/recipe/one_step_off_policy/ray_trainer.py @@ -18,24 +18,40 @@ This trainer supports model-agonistic model initialization with huggingface """ -import warnings +import uuid from pprint import pprint +import numpy as np import ray +import torch from omegaconf import OmegaConf from torch.utils.data import Dataset, Sampler from tqdm import tqdm +from verl import 
DataProto from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup +from verl.single_controller.ray.base import create_colocated_worker_cls from verl.trainer.ppo import core_algos -from verl.trainer.ppo.core_algos import AdvantageEstimator +from verl.trainer.ppo.core_algos import AdvantageEstimator, agg_loss +from verl.trainer.ppo.metric_utils import ( + compute_data_metrics, + compute_throughout_metrics, + compute_timing_metrics, +) from verl.trainer.ppo.ray_trainer import ( RayPPOTrainer, ResourcePoolManager, Role, WorkerType, + apply_kl_penalty, + compute_advantage, + compute_response_mask, ) +from verl.trainer.ppo.reward import compute_reward, compute_reward_async from verl.utils.debug import marked_timer +from verl.utils.metric import ( + reduce_metrics, +) from verl.utils.tracking import ValidationGenerationsLogger @@ -89,7 +105,7 @@ def __init__( val_dataset: Dataset | None = None, collate_fn=None, train_sampler: Sampler | None = None, - device_name=None, + device_name="cuda", ): """ Initialize distributed PPO trainer with Ray backend. 
@@ -127,31 +143,32 @@ def __init__( self.use_reference_policy = Role.RefPolicy in role_worker_mapping self.use_rm = Role.RewardModel in role_worker_mapping self.ray_worker_group_cls = ray_worker_group_cls - self.device_name = device_name if device_name else self.config.trainer.device - self.validation_generations_logger = ValidationGenerationsLogger( - project_name=self.config.trainer.project_name, - experiment_name=self.config.trainer.experiment_name, - ) + self.device_name = device_name + self.validation_generations_logger = ValidationGenerationsLogger() # if ref_in_actor is True, the reference policy will be actor without lora applied self.ref_in_actor = config.actor_rollout_ref.model.get("lora_rank", 0) > 0 # define in-reward KL control # kl loss control currently not suppoorted - if self.config.algorithm.use_kl_in_reward: - self.kl_ctrl_in_reward = core_algos.get_kl_controller(self.config.algorithm.kl_ctrl) + if config.algorithm.use_kl_in_reward: + self.kl_ctrl_in_reward = core_algos.get_kl_controller(config.algorithm.kl_ctrl) - if config.critic.enable is not None: - self.use_critic = bool(config.critic.enable) - elif self.config.algorithm.adv_estimator == AdvantageEstimator.GAE: + if self.config.algorithm.adv_estimator == AdvantageEstimator.GAE: self.use_critic = True - else: - warnings.warn( - "Disabled critic as algorithm.adv_estimator != gae. 
" - "If it is not intended, please set critic.enable=True", - stacklevel=2, - ) + elif self.config.algorithm.adv_estimator in [ + AdvantageEstimator.GRPO, + AdvantageEstimator.GRPO_PASSK, + AdvantageEstimator.REINFORCE_PLUS_PLUS, + # AdvantageEstimator.REMAX, # TODO:REMAX advantage estimator is not yet supported in one_step_off_policy + AdvantageEstimator.RLOO, + AdvantageEstimator.OPO, + AdvantageEstimator.REINFORCE_PLUS_PLUS_BASELINE, + AdvantageEstimator.GPG, + ]: self.use_critic = False + else: + raise NotImplementedError self._validate_config() self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler) @@ -162,32 +179,94 @@ def _validate(self): self.actor_rollout_wg = self.actor_wg return ret - def _create_actor_rollout_classes(self): + def init_workers(self): + """Initialize distributed training workers using Ray backend. + + Creates: + 1. Ray resource pools from configuration + 2. Worker groups for each role (actor, critic, etc.) + """ + self.resource_pool_manager.create_resource_pool() + + self.resource_pool_to_cls = {pool: {} for pool in self.resource_pool_manager.resource_pool_dict.values()} + # create actor and rollout - for role in [Role.Actor, Role.Rollout]: + for role, role_name in [(Role.Actor, "actor"), (Role.Rollout, "rollout")]: resource_pool = self.resource_pool_manager.get_resource_pool(role) role_cls = RayClassWithInitArgs( cls=self.role_worker_mapping[role], config=self.config.actor_rollout_ref, - role=str(role), + role=role_name, + ) + self.resource_pool_to_cls[resource_pool][role_name] = role_cls + + # create critic + if self.use_critic: + resource_pool = self.resource_pool_manager.get_resource_pool(Role.Critic) + critic_cls = RayClassWithInitArgs(cls=self.role_worker_mapping[Role.Critic], config=self.config.critic) + self.resource_pool_to_cls[resource_pool]["critic"] = critic_cls + + # create reference policy if needed + if self.use_reference_policy: + resource_pool = 
self.resource_pool_manager.get_resource_pool(Role.RefPolicy) + ref_policy_cls = RayClassWithInitArgs( + self.role_worker_mapping[Role.RefPolicy], + config=self.config.actor_rollout_ref, + role="ref", + profile_option=self.config.trainer.npu_profile.options, + ) + self.resource_pool_to_cls[resource_pool]["ref"] = ref_policy_cls + + # create a reward model if reward_fn is None + if self.use_rm: + # we create a RM here + resource_pool = self.resource_pool_manager.get_resource_pool(Role.RewardModel) + rm_cls = RayClassWithInitArgs(self.role_worker_mapping[Role.RewardModel], config=self.config.reward_model) + self.resource_pool_to_cls[resource_pool]["rm"] = rm_cls + + # initialize WorkerGroup + # NOTE: if you want to use a different resource pool for each role, which can support different parallel size, + # you should not use `create_colocated_worker_cls`. + # Instead, directly pass different resource pool to different worker groups. + # See https://github.com/volcengine/verl/blob/master/examples/ray/tutorial.ipynb for more information. 
+ all_wg = {} + wg_kwargs = {} # Setting up kwargs for RayWorkerGroup + if OmegaConf.select(self.config.trainer, "ray_wait_register_center_timeout") is not None: + wg_kwargs["ray_wait_register_center_timeout"] = self.config.trainer.ray_wait_register_center_timeout + if OmegaConf.select(self.config.trainer, "profile_steps") is not None: + wg_kwargs["profile_steps"] = OmegaConf.select(self.config.trainer, "profile_steps") + assert OmegaConf.select(self.config.trainer, "worker_nsight_options") is not None, ( + "worker_nsight_options must be set when profile_steps is set" + ) + wg_kwargs["worker_nsight_options"] = OmegaConf.to_container( + OmegaConf.select(self.config.trainer, "worker_nsight_options") + ) + + for resource_pool, class_dict in self.resource_pool_to_cls.items(): + worker_dict_cls = create_colocated_worker_cls(class_dict=class_dict) + wg_dict = self.ray_worker_group_cls( + resource_pool=resource_pool, + ray_cls_with_init=worker_dict_cls, + device_name=self.device_name, + **wg_kwargs, ) - self.resource_pool_to_cls[resource_pool][str(role)] = role_cls + spawn_wg = wg_dict.spawn(prefix_set=class_dict.keys()) + all_wg.update(spawn_wg) - def _init_models(self): if self.use_critic: - self.critic_wg = self.all_wg[str(Role.Critic)] + self.critic_wg = all_wg["critic"] self.critic_wg.init_model() if self.use_reference_policy and not self.ref_in_actor: - self.ref_policy_wg = self.all_wg[str(Role.RefPolicy)] + self.ref_policy_wg = all_wg["ref"] self.ref_policy_wg.init_model() if self.use_rm: - self.rm_wg = self.all_wg[str(Role.RewardModel)] + self.rm_wg = all_wg["rm"] self.rm_wg.init_model() - self.actor_wg = self.all_wg[str(Role.Actor)] - self.rollout_wg = self.all_wg[str(Role.Rollout)] + self.actor_wg = all_wg["actor"] + self.rollout_wg = all_wg["rollout"] self.actor_wg.init_model() self.rollout_wg.init_model() self.actor_rollout_wg = self.actor_wg # to be compatible with the functions that not be modified @@ -205,9 +284,21 @@ def _init_models(self): ) 
self.sync_rollout_weights() + # create async rollout manager and request scheduler + self.async_rollout_mode = False + if self.config.actor_rollout_ref.rollout.mode == "async" and self._is_rollout: + from verl.workers.rollout.async_server import AsyncLLMServerManager + + self.async_rollout_mode = True + self.async_rollout_manager = AsyncLLMServerManager( + config=self.config, + worker_group=self.rollout_wg, + ) + def sync_rollout_weights(self): - self.actor_wg.sync_rollout_weights() - ray.get(self.rollout_wg.sync_rollout_weights()) + if not self.hybrid_engine: + self.actor_wg.sync_rollout_weights() + ray.get(self.rollout_wg.sync_rollout_weights()) def _create_continuous_iterator(self): """ @@ -229,7 +320,23 @@ def _async_gen_next_batch(self, continuous_iterator): except Exception as e: print(f"Error in async_gen_next_batch: {e}") return None - batch, gen_batch = self._prepare_generate_batch(batch_dict) + batch = DataProto.from_single_dict(batch_dict) + # pop those keys for generation + batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"] + non_tensor_batch_keys_to_pop = ["raw_prompt_ids"] + if "multi_modal_data" in batch.non_tensor_batch: + non_tensor_batch_keys_to_pop.append("multi_modal_data") + if "raw_prompt" in batch.non_tensor_batch: + non_tensor_batch_keys_to_pop.append("raw_prompt") + if "tools_kwargs" in batch.non_tensor_batch: + non_tensor_batch_keys_to_pop.append("tools_kwargs") + if "interaction_kwargs" in batch.non_tensor_batch: + non_tensor_batch_keys_to_pop.append("interaction_kwargs") + gen_batch = batch.pop( + batch_keys=batch_keys_to_pop, + non_tensor_batch_keys=non_tensor_batch_keys_to_pop, + ) + gen_batch = gen_batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True) # sync weights from actor to rollout self.sync_rollout_weights() # async generation @@ -243,6 +350,7 @@ def fit(self): to construct the PPO dataflow. The light-weight advantage computation is done on the driver process. 
""" + from omegaconf import OmegaConf from verl.utils.tracking import Tracking @@ -274,7 +382,6 @@ def fit(self): # we start from step 1 self.global_steps += 1 last_val_metrics = None - self.max_steps_duration = 0 # across epoch iterator continuous_iterator = self._create_continuous_iterator() @@ -283,16 +390,24 @@ def fit(self): batch_data_future = self._async_gen_next_batch(continuous_iterator) while batch_data_future is not None: - metrics = {} - timing_raw = {} - do_profile = ( self.global_steps in self.config.trainer.profile_steps if self.config.trainer.profile_steps is not None else False ) - self._start_profiling(do_profile, timing_raw) + if do_profile: + self.actor_wg.start_profile() + if not self.hybrid_engine: + self.rollout_wg.start_profile() + if self.use_reference_policy: + self.ref_policy_wg.start_profile() + if self.use_critic: + self.critic_wg.start_profile() + if self.use_rm: + self.rm_wg.start_profile() + metrics = {} + timing_raw = {} is_last_step = self.global_steps >= self.total_training_steps with marked_timer("step", timing_raw): @@ -307,15 +422,184 @@ def fit(self): if not is_last_step: batch_data_future = self._async_gen_next_batch(continuous_iterator) - batch = self._post_generate_batch(batch, gen_batch_output, metrics) - batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw) - self._log_rollout(batch, reward_extra_infos_dict, timing_raw) - last_val_metrics = self._validate_metrics(is_last_step, last_val_metrics, metrics, timing_raw) - self._check_save_checkpoint(is_last_step, timing_raw) - - self._stop_profiling(do_profile, timing_raw) - self._collect_metrics(batch, epoch, metrics, timing_raw) - self._post_batch_processing(batch) + batch.non_tensor_batch["uid"] = np.array( + [str(uuid.uuid4()) for _ in range(len(batch.batch))], dtype=object + ) + # repeat to align with repeated responses in rollout + batch = batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True) + batch = 
batch.union(gen_batch_output) + + batch.batch["response_mask"] = compute_response_mask(batch) + # Balance the number of valid tokens across DP ranks. + # NOTE: This usually changes the order of data in the `batch`, + # which won't affect the advantage calculation (since it's based on uid), + # but might affect the loss calculation (due to the change of mini-batching). + # TODO: Decouple the DP balancing and mini-batching. + if self.config.trainer.balance_batch: + self._balance_batch(batch, metrics=metrics) + + # compute global_valid tokens + batch.meta_info["global_token_num"] = torch.sum(batch.batch["attention_mask"], dim=-1).tolist() + + with marked_timer("reward", timing_raw, color="yellow"): + # compute reward model score + if self.use_rm: + reward_tensor = self.rm_wg.compute_rm_score(batch) + batch = batch.union(reward_tensor) + + if self.config.reward_model.launch_reward_fn_async: + future_reward = compute_reward_async.remote(batch, self.config, self.tokenizer) + else: + reward_tensor, reward_extra_infos_dict = compute_reward(batch, self.reward_fn) + + # recompute old_log_probs + with marked_timer("old_log_prob", timing_raw, color="blue"): + old_log_prob = self.actor_wg.compute_log_prob(batch) + entropys = old_log_prob.batch["entropys"] + response_masks = batch.batch["response_mask"] + loss_agg_mode = self.config.actor_rollout_ref.actor.loss_agg_mode + entropy_agg = agg_loss(loss_mat=entropys, loss_mask=response_masks, loss_agg_mode=loss_agg_mode) + old_log_prob_metrics = {"actor/entropy": entropy_agg.detach().item()} + metrics.update(old_log_prob_metrics) + old_log_prob.batch.pop("entropys") + batch = batch.union(old_log_prob) + + if "rollout_log_probs" in batch.batch.keys(): + # TODO: we may want to add diff of probs too. 
+ rollout_old_log_probs = batch.batch["rollout_log_probs"] + actor_old_log_probs = batch.batch["old_log_probs"] + attention_mask = batch.batch["attention_mask"] + responses = batch.batch["responses"] + response_length = responses.size(1) + response_mask = attention_mask[:, -response_length:] + + rollout_probs = torch.exp(rollout_old_log_probs) + actor_probs = torch.exp(actor_old_log_probs) + rollout_probs_diff = torch.abs(rollout_probs - actor_probs) + rollout_probs_diff = torch.masked_select(rollout_probs_diff, response_mask.bool()) + rollout_probs_diff_max = torch.max(rollout_probs_diff) + rollout_probs_diff_mean = torch.mean(rollout_probs_diff) + rollout_probs_diff_std = torch.std(rollout_probs_diff) + metrics.update( + { + "training/rollout_probs_diff_max": rollout_probs_diff_max.detach().item(), + "training/rollout_probs_diff_mean": rollout_probs_diff_mean.detach().item(), + "training/rollout_probs_diff_std": rollout_probs_diff_std.detach().item(), + } + ) + + if self.use_reference_policy: + # compute reference log_prob + with marked_timer("ref", timing_raw, color="olive"): + if not self.ref_in_actor: + ref_log_prob = self.ref_policy_wg.compute_ref_log_prob(batch) + else: + ref_log_prob = self.actor_wg.compute_ref_log_prob(batch) + batch = batch.union(ref_log_prob) + + # compute values + if self.use_critic: + with marked_timer("values", timing_raw, color="cyan"): + values = self.critic_wg.compute_values(batch) + batch = batch.union(values) + + with marked_timer("adv", timing_raw, color="brown"): + # we combine with rule-based rm + reward_extra_infos_dict: dict[str, list] + if self.config.reward_model.launch_reward_fn_async: + reward_tensor, reward_extra_infos_dict = ray.get(future_reward) + batch.batch["token_level_scores"] = reward_tensor + + if reward_extra_infos_dict: + batch.non_tensor_batch.update({k: np.array(v) for k, v in reward_extra_infos_dict.items()}) + + # compute rewards. 
apply_kl_penalty if available + if self.config.algorithm.use_kl_in_reward: + batch, kl_metrics = apply_kl_penalty( + batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.config.algorithm.kl_penalty + ) + metrics.update(kl_metrics) + else: + batch.batch["token_level_rewards"] = batch.batch["token_level_scores"] + + # compute advantages, executed on the driver process + + norm_adv_by_std_in_grpo = self.config.algorithm.get( + "norm_adv_by_std_in_grpo", True + ) # GRPO adv normalization factor + + batch = compute_advantage( + batch, + adv_estimator=self.config.algorithm.adv_estimator, + gamma=self.config.algorithm.gamma, + lam=self.config.algorithm.lam, + num_repeat=self.config.actor_rollout_ref.rollout.n, + norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo, + config=self.config.algorithm, + ) + + # update critic + if self.use_critic: + with marked_timer("update_critic", timing_raw, color="pink"): + critic_output = self.critic_wg.update_critic(batch) + critic_output_metrics = reduce_metrics(critic_output.meta_info["metrics"]) + metrics.update(critic_output_metrics) + + # implement critic warmup + if self.config.trainer.critic_warmup <= self.global_steps: + # update actor + with marked_timer("update_actor", timing_raw, color="red"): + batch.meta_info["multi_turn"] = self.config.actor_rollout_ref.rollout.multi_turn.enable + actor_output = self.actor_wg.update_actor(batch) + actor_output_metrics = reduce_metrics(actor_output.meta_info["metrics"]) + metrics.update(actor_output_metrics) + + # Log rollout generations if enabled + rollout_data_dir = self.config.trainer.get("rollout_data_dir", None) + if rollout_data_dir: + with marked_timer("dump_rollout_generations", timing_raw, color="green"): + inputs = self.tokenizer.batch_decode(batch.batch["prompts"], skip_special_tokens=True) + outputs = self.tokenizer.batch_decode(batch.batch["responses"], skip_special_tokens=True) + scores = batch.batch["token_level_scores"].sum(-1).cpu().tolist() + self._dump_generations( + 
inputs=inputs, + outputs=outputs, + scores=scores, + reward_extra_infos_dict=reward_extra_infos_dict, + dump_path=rollout_data_dir, + ) + + # validate + if ( + self.val_reward_fn is not None + and self.config.trainer.test_freq > 0 + and (is_last_step or self.global_steps % self.config.trainer.test_freq == 0) + ): + with marked_timer("testing", timing_raw, color="green"): + val_metrics: dict = self._validate() + if is_last_step: + last_val_metrics = val_metrics + metrics.update(val_metrics) + + if self.config.trainer.save_freq > 0 and ( + is_last_step or self.global_steps % self.config.trainer.save_freq == 0 + ): + with marked_timer("save_checkpoint", timing_raw, color="green"): + self._save_checkpoint() + + # training metrics + metrics.update( + { + "training/global_step": self.global_steps, + "training/epoch": epoch, + } + ) + # collect metrics + metrics.update(compute_data_metrics(batch=batch, use_critic=self.use_critic)) + metrics.update(compute_timing_metrics(batch=batch, timing_raw=timing_raw)) + # TODO: implement actual tflpo and theoretical tflpo + n_gpus = self.resource_pool_manager.get_n_gpus() + metrics.update(compute_throughout_metrics(batch=batch, timing_raw=timing_raw, n_gpus=n_gpus)) # TODO: make a canonical logger that supports various backend logger.log(data=metrics, step=self.global_steps) @@ -323,6 +607,17 @@ def fit(self): progress_bar.update(1) self.global_steps += 1 + if do_profile: + self.actor_wg.stop_profile() + if not self.hybrid_engine: + self.rollout_wg.stop_profile() + if self.use_reference_policy: + self.ref_policy_wg.stop_profile() + if self.use_critic: + self.critic_wg.stop_profile() + if self.use_rm: + self.rm_wg.stop_profile() + if is_last_step: pprint(f"Final validation metrics: {last_val_metrics}") progress_bar.close() diff --git a/recipe/one_step_off_policy/vllm_sharding_manager.py b/recipe/one_step_off_policy/vllm_sharding_manager.py new file mode 100644 index 00000000000..c33ba585470 --- /dev/null +++ 
b/recipe/one_step_off_policy/vllm_sharding_manager.py @@ -0,0 +1,74 @@ +# Copyright 2025 Bytedance Ltd. and/or its affiliates +# Copyright 2025 Meituan Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os + +from torch.distributed.device_mesh import DeviceMesh + +from verl import DataProto +from verl.protocol import all_gather_data_proto +from verl.third_party.vllm import parallel_state as vllm_ps +from verl.utils.debug import GPUMemoryLogger +from verl.utils.device import get_torch_device +from verl.utils.torch_functional import check_device_is_available +from verl.workers.sharding_manager.base import BaseShardingManager + +logger = logging.getLogger(__file__) +logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) + + +class VLLMShardingManager(BaseShardingManager): + @check_device_is_available() + def __init__(self, inference_engine, device_mesh: DeviceMesh): + self.device_mesh = device_mesh + self.inference_engine = inference_engine + inference_engine.wake_up() + assert device_mesh is not None + assert inference_engine is not None + self.tp_size = self.device_mesh["infer_tp"].size() + self.tp_rank = self.device_mesh["infer_tp"].get_local_rank() + self.timing = {} + gen_dp_rank = self.device_mesh["dp"].get_local_rank() + get_torch_device().manual_seed(gen_dp_rank + 1000) + self.gen_random_states = get_torch_device().get_rng_state() + + @GPUMemoryLogger(role="vllm sharding_manager", logger=logger) + def 
__enter__(self): + get_torch_device().set_rng_state(self.gen_random_states) + + @GPUMemoryLogger(role="vllm sharding_manager", logger=logger) + def __exit__(self, exc_type, exc_value, traceback): + self.gen_random_states = get_torch_device().get_rng_state() + self.inference_engine.reset_prefix_cache() + + @GPUMemoryLogger(role="vllm sharding_manager", logger=logger) + def preprocess_data(self, data: DataProto) -> DataProto: + """All gather across tp group to make each rank has identical input.""" + if self.tp_size == 1: + return data + + group = vllm_ps.get_tensor_model_parallel_group().device_group + + all_gather_data_proto(data=data, process_group=group) + return data + + @GPUMemoryLogger(role="vllm sharding_manager", logger=logger) + def postprocess_data(self, data: DataProto) -> DataProto: + """Get chunk data of this tp rank since we do all gather in preprocess.""" + if self.tp_size == 1: + return data + + return data.chunk(chunks=self.tp_size)[self.tp_rank] diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh index 142ee3e8806..63cfcf622a6 100644 --- a/tests/special_e2e/run_fully_async_policy.sh +++ b/tests/special_e2e/run_fully_async_policy.sh @@ -59,7 +59,7 @@ train_prompt_mini_bsz=32 total_rollout_steps=$(((128*2))) test_freq=10 staleness_threshold=1 -trigger_parameter_sync_step=1 +trigger_parameter_sync_step=16 partial_rollout=True exp_name="$(basename "${MODEL_ID,,}")-fully-async-policy-${ACTOR_STRATEGY}-minimal" From aa57cd48a150b3e968d88a8b66f217f25097b3e7 Mon Sep 17 00:00:00 2001 From: wangshulin02 <953550366@qq.com> Date: Fri, 5 Sep 2025 17:32:39 +0800 Subject: [PATCH 099/182] fix some metrics aggregate --- recipe/fully_async_policy/detach_utils.py | 129 ++++++------------ .../fully_async_policy/fully_async_trainer.py | 4 +- 2 files changed, 41 insertions(+), 92 deletions(-) diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py index 48a41443612..e01d82c1726 
100644 --- a/recipe/fully_async_policy/detach_utils.py +++ b/recipe/fully_async_policy/detach_utils.py @@ -234,7 +234,7 @@ def assemble_batch_from_rollout_samples( class MetricsAggregator: """Metrics aggregator, used to combine metrics from multiple training steps""" - def __init__(self): + def __init__(self, total_gpus: int): # Store all values ​​for each metric self.metric_values: Dict[str, List[float]] = defaultdict(list) # Store the number of samples at each step for weighted averaging @@ -243,6 +243,8 @@ def __init__(self): self.timestamps: List[float] = [] # Step Count self.step_count = 0 + # total num gpus used + self.total_gpus = total_gpus # Metric aggregation rule configuration self.aggregation_rules = self._init_aggregation_rules() @@ -250,57 +252,7 @@ def __init__(self): def _init_aggregation_rules(self) -> Dict[str, Dict[str, List[str]]]: """Initialize metrics aggregation rules""" return { - # # Cumulative metrics - take the last value - # 'last': [ - # 'fully_async/stale_samples_processed', - # 'fully_async/current_param_version', - # 'global_steps', - # 'epoch', - # ], - - # # Weighted average metrics - weighted by sample size - # 'weighted_avg': [ - # 'fully_async/stale_samples_ratio', - # 'policy_loss', - # 'value_loss', - # 'entropy_loss', - # 'kl_divergence', - # 'advantage_mean', - # 'advantage_std', - # 'learning_rate', - # ], - - # # Summation type metrics - direct accumulation - # 'sum': [ - # 'fully_async/total_wait_time', - # 'processed_samples', - # 'total_tokens', - # ], - - # Average metrics - Simple Average - # 'avg': [ - # 'perf/throughput', - # 'fully_async/avg_processing_time', - # 'fully_async/tp50_processing_time', - # 'fully_async/tp95_processing_time', - # 'fully_async/tp99_processing_time', - # 'grad_norm', - # ], - - # # Maximum value metrics - # 'max': [ - # 'fully_async/max_processing_time', - # 'max_grad_norm', - # 'peak_memory_usage', - # ], - - # # Minimum value metrics - # 'min': [ - # 'fully_async/min_processing_time', - 
# 'min_learning_rate', - # ], - - # Time-Based metrics - Special Treatment + # Time-Based metrics, can add metrics here 'time_sum': [ 'timing_s/adv', 'timing_s/gen', @@ -332,35 +284,26 @@ def _get_aggregation_type(self, metric_name: str) -> str: for agg_type, metric_list in self.aggregation_rules.items(): if metric_name in metric_list: return agg_type + + metric_lower = metric_name.lower() + if any(keyword in metric_lower for keyword in ['timing_s/']): + return 'time_sum' + if any(keyword in metric_lower for keyword in ['mean', 'avg', 'average']): + return 'avg' + if any(keyword in metric_lower for keyword in ['max', 'maximum']): + return 'max' + if any(keyword in metric_lower for keyword in ['min', 'minimum']): + return 'min' + if any(keyword in metric_lower for keyword in ['sum', 'total']): + return 'sum' + if any(keyword in metric_lower for keyword in ['weighted_avg']): + return 'weighted_avg' + import warnings warnings.warn(f"No aggregation rule is matched in init_aggregation_rules. \ - For metric {metric_name}, the 'last' method is used") - return 'last' + For metric {metric_name}, the 'avg' method is used") + return 'avg' - # raise ValueError(f"No aggregation rule is matched in init_aggregation_rules. \ - # Metric name: {metric_name}") # TODO: 删除 - - - # Aggregation rules based on naming patterns - if metric_name.startswith('time/'): - aggregation_type = 'time_sum' - elif metric_name.endswith('_ratio') or metric_name.endswith('_rate'): - aggregation_type = 'weighted_avg' - elif metric_name.endswith('_count') or metric_name.endswith('_total'): - aggregation_type = 'sum' - elif metric_name.startswith('max_') or metric_name.endswith('_max'): - aggregation_type = 'max' - elif metric_name.startswith('min_') or metric_name.endswith('_min'): - aggregation_type = 'min' - else: - # The default is weighted average. 
- aggregation_type = 'weighted_avg' - import warnings - warnings.simplefilter("always", DeprecationWarning) - warnings.warn("No aggregation rule is matched in init_aggregation_rules. \ - Aggregation rule is matched based on name prefix:", aggregation_type) - return aggregation_type - def _aggregate_single_metric(self, metric_name: str, values: List[float]) -> float: """Aggregating a single metric""" if not values: @@ -402,6 +345,7 @@ def _aggregate_single_metric(self, metric_name: str, values: List[float]) -> flo def get_aggregated_metrics(self) -> Dict[str, Any]: """aggregated metrics""" + t = time.time() if self.step_count == 0: return {} @@ -411,21 +355,24 @@ def get_aggregated_metrics(self) -> Dict[str, Any]: for metric_name, values in self.metric_values.items(): aggregated[metric_name] = self._aggregate_single_metric(metric_name, values) - # # Adding aggregate statistics - # aggregated.update({ - # 'aggregation/step_count': self.step_count, - # 'aggregation/total_samples': sum(self.sample_counts), - # 'aggregation/avg_samples_per_step': sum(self.sample_counts) / self.step_count, - # 'aggregation/time_span': self.timestamps[-1] - self.timestamps[0] if len(self.timestamps) > 1 else 0, - # }) + # Aggregate special metrics + aggregated = self._special_metrics_aggergate(aggregated) + + print(f"******************************aggregated metrics done. 
cost {time.time() - t}") - # # Add statistics on sample size - # if self.sample_counts: - # aggregated.update({ - # 'aggregation/min_samples_per_step': min(self.sample_counts), - # 'aggregation/max_samples_per_step': max(self.sample_counts), - # }) + return aggregated + + def _special_metrics_aggergate(self, aggregated: Dict[str, Any]) -> Dict[str, Any]: + """calculate special metrics""" + + if "global_seqlen/minmax_diff" in aggregated.keys(): + aggregated["global_seqlen/minmax_diff"] = aggregated["global_seqlen/max"] - aggregated["global_seqlen/min"] + REQUIRED_PERF_KEYS = {"perf/throughput", "perf/total_num_tokens", "perf/time_per_step"} + if REQUIRED_PERF_KEYS.issubset(aggregated): + aggregated["perf/throughput"] = aggregated['perf/total_num_tokens'] / \ + (aggregated["perf/time_per_step"] * self.total_gpus) + return aggregated def reset(self): diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 9276d148b66..0a200e76b1d 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -121,7 +121,9 @@ def __init__( self.required_samples = int( self.minimal_bsz * config.actor_rollout_ref.actor.ppo_mini_batch_size / config.actor_rollout_ref.rollout.n ) - self.metrics_aggregator = MetricsAggregator() + total_gpus = config.trainer.nnodes * config.trainer.n_gpus_per_node + \ + config.rollout.nnodes * config.rollout.n_gpus_per_node + self.metrics_aggregator = MetricsAggregator(total_gpus=total_gpus) def set_message_queue_client(self, message_queue_client: MessageQueueClient): """Set message queue client""" From 570eb3ba9fda31f35118e99cc407dc36ca87a61b Mon Sep 17 00:00:00 2001 From: wangshulin02 <953550366@qq.com> Date: Fri, 5 Sep 2025 17:53:20 +0800 Subject: [PATCH 100/182] temporarily fix log_prob --- verl/trainer/ppo/ray_trainer.py | 75 +++++++++++++++++---------------- 1 file changed, 39 insertions(+), 36 deletions(-) diff --git 
a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py index e61b1dc5fe0..42c728fa79d 100644 --- a/verl/trainer/ppo/ray_trainer.py +++ b/verl/trainer/ppo/ray_trainer.py @@ -1247,42 +1247,45 @@ def _process_batch_common(self, batch, metrics, timing_raw): reward_tensor, reward_extra_infos_dict = compute_reward(batch, self.reward_fn) # recompute old_log_probs with marked_timer("old_log_prob", timing_raw, color="blue"): - old_log_prob = self.actor_rollout_wg.compute_log_prob(batch) - entropys = old_log_prob.batch["entropys"] - response_masks = batch.batch["response_mask"] - loss_agg_mode = self.config.actor_rollout_ref.actor.loss_agg_mode - entropy_agg = agg_loss(loss_mat=entropys, loss_mask=response_masks, loss_agg_mode=loss_agg_mode) - old_log_prob_metrics = {"actor/entropy": entropy_agg.detach().item()} - metrics.update(old_log_prob_metrics) - old_log_prob.batch.pop("entropys") - batch = batch.union(old_log_prob) - - if "rollout_log_probs" in batch.batch.keys(): - # TODO: we may want to add diff of probs too. 
- rollout_old_log_probs = batch.batch["rollout_log_probs"] - actor_old_log_probs = batch.batch["old_log_probs"] - attention_mask = batch.batch["attention_mask"] - responses = batch.batch["responses"] - response_length = responses.size(1) - response_mask = attention_mask[:, -response_length:] - - rollout_probs = torch.exp(rollout_old_log_probs) - actor_probs = torch.exp(actor_old_log_probs) - rollout_probs_diff = torch.abs(rollout_probs - actor_probs) - rollout_probs_diff = torch.masked_select(rollout_probs_diff, response_mask.bool()) - rollout_probs_diff_max = torch.max(rollout_probs_diff) - rollout_probs_diff_mean = torch.mean(rollout_probs_diff) - rollout_probs_diff_std = torch.std(rollout_probs_diff) - metrics.update( - { - "training/rollout_probs_diff_max": rollout_probs_diff_max.detach().item(), - "training/rollout_probs_diff_mean": rollout_probs_diff_mean.detach().item(), - "training/rollout_probs_diff_std": rollout_probs_diff_std.detach().item(), - } - ) - if self.config.async_training and self.config.async_training.use_rollout_log_probs: - batch.batch["old_log_probs"] = batch.batch["rollout_log_probs"] - del actor_old_log_probs + if self.config.async_training and self.config.async_training.use_rollout_log_probs: + batch.batch["old_log_probs"] = batch.batch["rollout_log_probs"] + del actor_old_log_probs + else: + + old_log_prob = self.actor_rollout_wg.compute_log_prob(batch) + entropys = old_log_prob.batch["entropys"] + response_masks = batch.batch["response_mask"] + loss_agg_mode = self.config.actor_rollout_ref.actor.loss_agg_mode + entropy_agg = agg_loss(loss_mat=entropys, loss_mask=response_masks, loss_agg_mode=loss_agg_mode) + old_log_prob_metrics = {"actor/entropy": entropy_agg.detach().item()} + metrics.update(old_log_prob_metrics) + old_log_prob.batch.pop("entropys") + batch = batch.union(old_log_prob) + + if "rollout_log_probs" in batch.batch.keys(): + # TODO: we may want to add diff of probs too. 
+ rollout_old_log_probs = batch.batch["rollout_log_probs"] + actor_old_log_probs = batch.batch["old_log_probs"] + attention_mask = batch.batch["attention_mask"] + responses = batch.batch["responses"] + response_length = responses.size(1) + response_mask = attention_mask[:, -response_length:] + + rollout_probs = torch.exp(rollout_old_log_probs) + actor_probs = torch.exp(actor_old_log_probs) + rollout_probs_diff = torch.abs(rollout_probs - actor_probs) + rollout_probs_diff = torch.masked_select(rollout_probs_diff, response_mask.bool()) + rollout_probs_diff_max = torch.max(rollout_probs_diff) + rollout_probs_diff_mean = torch.mean(rollout_probs_diff) + rollout_probs_diff_std = torch.std(rollout_probs_diff) + metrics.update( + { + "training/rollout_probs_diff_max": rollout_probs_diff_max.detach().item(), + "training/rollout_probs_diff_mean": rollout_probs_diff_mean.detach().item(), + "training/rollout_probs_diff_std": rollout_probs_diff_std.detach().item(), + } + ) + if self.use_reference_policy: # compute reference log_prob From 5a85685bf472fab84d3719664cf456097426b588 Mon Sep 17 00:00:00 2001 From: wangshulin02 <953550366@qq.com> Date: Fri, 5 Sep 2025 17:54:17 +0800 Subject: [PATCH 101/182] add exp folder --- .../dapo_7b_math_fsdp2_colocate.sh | 137 ++++++++++++++ .../fsdp2_colocate/runtime_env.yaml | 5 + ..._fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh | 174 ++++++++++++++++++ .../fsdp2_fully-async_16-16/runtime_env.yaml | 5 + ..._fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh | 174 ++++++++++++++++++ .../fsdp2_fully-async_24-8/runtime_env.yaml | 5 + ..._fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh | 174 ++++++++++++++++++ .../fsdp2_fully-async_8-24/runtime_env.yaml | 5 + 8 files changed, 679 insertions(+) create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_colocate/runtime_env.yaml create mode 100644 
recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_16-16/runtime_env.yaml create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_24-8/runtime_env.yaml create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_8-24/runtime_env.yaml diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh b/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh new file mode 100644 index 00000000000..8b627fd6eed --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh @@ -0,0 +1,137 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +project_name='DAPO' +exp_name='dapo_qwen2-7B-math_28k_fsdp2_colocate_32_mbs32_tfq20' + +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 * 28)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 + +loss_agg_mode="token-mean" + +train_prompt_bsz=512 +n_resp_per_prompt=16 +train_prompt_mini_bsz=32 + +# Ray +# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} +# WORKING_DIR=${WORKING_DIR:-"${PWD}"} +# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} +NNODES=${NNODES:-4} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} +# Paths +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +# very important! 
please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface +MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"} +CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} +TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} +TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} + +MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B +CKPTS_DIR=./ckpts/${project_name}/${exp_name} +TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet +TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + +# Performance Related Parameter +use_dynamic_bsz=True +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) +offload=True +gen_tp=4 +sp_size=4 +fsdp_size=2 + +# reference run wandb: https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/runs/ow47vvon?nw=nwusertongyuxuan361 + + +python -m verl.trainer.main_ppo \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.strategy=fsdp2 \ + critic.strategy=fsdp2 \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + 
actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + actor_rollout_ref.model.use_remove_padding=True \ + +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ + actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.grad_clip=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + 
actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \ + actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ + reward_model.reward_manager=dapo \ + +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger=['console','tensorboard'] \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ + trainer.nnodes="${NNODES}" \ + trainer.val_before_train=True \ + trainer.test_freq=20 \ + trainer.save_freq=-1 \ + trainer.total_epochs=10 \ + trainer.total_training_steps=400 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.resume_mode=auto \ + trainer.log_val_generations=10 diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_colocate/runtime_env.yaml new file mode 100644 index 00000000000..dcca08e67f7 --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_colocate/runtime_env.yaml @@ -0,0 +1,5 @@ +env_vars: + VLLM_USE_V1: "1" + TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard/dapo_7b_math_fsdp2_4_12" + NCCL_DEBUG: "INFO" + HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh 
b/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh new file mode 100644 index 00000000000..618497c0257 --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh @@ -0,0 +1,174 @@ +#!/usr/bin/env bash +set -xeuo pipefail +# dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20 +project_name='DAPO' +exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20' + +# Ray +# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} +# WORKING_DIR=${WORKING_DIR:-"${PWD}"} +# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} +# Paths +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface +MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"} +CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} +TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} +TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} + +# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B +MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B +CKPTS_DIR=./ckpts/${project_name}/${exp_name} +# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet +# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet +TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet +TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet + +rollout_mode="async" +rollout_name="vllm" # 
sglang or vllm +if [ "$rollout_mode" = "async" ]; then + export VLLM_USE_V1=1 + return_raw_chat="True" +fi + +# Algorithm parameters +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +# Response length parameters +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 *8)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 + +# Training parameters +loss_agg_mode="token-mean" + +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + +# Performance Related Parameter +use_dynamic_bsz=True +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) +ref_offload=True +actor_offload=False +gen_tp=1 +sp_size=1 +fsdp_size=2 + +# Fully async specific parameters +NNODES=${NNODES:-2} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} + +n_gpus_rollout=6 +n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout)) + +train_prompt_bsz=0 +gen_prompt_bsz=1 +n_resp_per_prompt=16 +train_prompt_mini_bsz=32 +total_rollout_steps=$(((512*100))) +test_freq=10 +staleness_threshold=1 +trigger_parameter_sync_step=64 +partial_rollout=True + +PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python" +if [ ! 
-x "$PYTHON_INTERPRETER" ]; then + PYTHON_INTERPRETER="python3" +fi + +$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + data.gen_batch_size=${gen_prompt_bsz} \ + data.return_raw_chat=${return_raw_chat} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.strategy=fsdp2 \ + critic.strategy=fsdp2 \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.hybrid_engine=False \ + +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ + actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + 
actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.grad_clip=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \ + actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ + actor_rollout_ref.rollout.name=${rollout_name} \ + actor_rollout_ref.rollout.mode=${rollout_mode} \ + actor_rollout_ref.rollout.calculate_log_probs=True \ + reward_model.reward_manager=dapo \ + +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger=['console','tensorboard'] \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + 
trainer.val_before_train=True \ + trainer.save_freq=-1 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.resume_mode=auto \ + trainer.nnodes="${NNODES}" \ + trainer.n_gpus_per_node="${n_gpus_training}" \ + rollout.nnodes="${NNODES}" \ + rollout.n_gpus_per_node="${n_gpus_rollout}" \ + rollout.total_rollout_steps="${total_rollout_steps}" \ + rollout.total_epochs=10 \ + rollout.test_freq="${test_freq}" \ + async_training.staleness_threshold="${staleness_threshold}" \ + async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ + async_training.partial_rollout="${partial_rollout}" diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_16-16/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_16-16/runtime_env.yaml new file mode 100644 index 00000000000..dcca08e67f7 --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_16-16/runtime_env.yaml @@ -0,0 +1,5 @@ +env_vars: + VLLM_USE_V1: "1" + TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard/dapo_7b_math_fsdp2_4_12" + NCCL_DEBUG: "INFO" + HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh b/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh new file mode 100644 index 00000000000..618497c0257 --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh @@ -0,0 +1,174 @@ +#!/usr/bin/env bash +set -xeuo pipefail +# dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20 +project_name='DAPO' +exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20' + +# Ray +# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} +# 
WORKING_DIR=${WORKING_DIR:-"${PWD}"} +# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} +# Paths +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface +MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"} +CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} +TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} +TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} + +# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B +MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B +CKPTS_DIR=./ckpts/${project_name}/${exp_name} +# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet +# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet +TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet +TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet + +rollout_mode="async" +rollout_name="vllm" # sglang or vllm +if [ "$rollout_mode" = "async" ]; then + export VLLM_USE_V1=1 + return_raw_chat="True" +fi + +# Algorithm parameters +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +# Response length parameters +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 *8)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 + +# Training parameters +loss_agg_mode="token-mean" + +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + 
+# Performance Related Parameter +use_dynamic_bsz=True +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) +ref_offload=True +actor_offload=False +gen_tp=1 +sp_size=1 +fsdp_size=2 + +# Fully async specific parameters +NNODES=${NNODES:-2} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} + +n_gpus_rollout=6 +n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout)) + +train_prompt_bsz=0 +gen_prompt_bsz=1 +n_resp_per_prompt=16 +train_prompt_mini_bsz=32 +total_rollout_steps=$(((512*100))) +test_freq=10 +staleness_threshold=1 +trigger_parameter_sync_step=64 +partial_rollout=True + +PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python" +if [ ! -x "$PYTHON_INTERPRETER" ]; then + PYTHON_INTERPRETER="python3" +fi + +$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + data.gen_batch_size=${gen_prompt_bsz} \ + data.return_raw_chat=${return_raw_chat} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.strategy=fsdp2 \ + critic.strategy=fsdp2 \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.hybrid_engine=False \ + +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ + 
actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.grad_clip=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \ + actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ + 
actor_rollout_ref.rollout.name=${rollout_name} \ + actor_rollout_ref.rollout.mode=${rollout_mode} \ + actor_rollout_ref.rollout.calculate_log_probs=True \ + reward_model.reward_manager=dapo \ + +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger=['console','tensorboard'] \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.val_before_train=True \ + trainer.save_freq=-1 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.resume_mode=auto \ + trainer.nnodes="${NNODES}" \ + trainer.n_gpus_per_node="${n_gpus_training}" \ + rollout.nnodes="${NNODES}" \ + rollout.n_gpus_per_node="${n_gpus_rollout}" \ + rollout.total_rollout_steps="${total_rollout_steps}" \ + rollout.total_epochs=10 \ + rollout.test_freq="${test_freq}" \ + async_training.staleness_threshold="${staleness_threshold}" \ + async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ + async_training.partial_rollout="${partial_rollout}" diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_24-8/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_24-8/runtime_env.yaml new file mode 100644 index 00000000000..dcca08e67f7 --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_24-8/runtime_env.yaml @@ -0,0 +1,5 @@ +env_vars: + VLLM_USE_V1: "1" + TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard/dapo_7b_math_fsdp2_4_12" + NCCL_DEBUG: "INFO" + HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git 
a/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh b/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh new file mode 100644 index 00000000000..618497c0257 --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh @@ -0,0 +1,174 @@ +#!/usr/bin/env bash +set -xeuo pipefail +# dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20 +project_name='DAPO' +exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20' + +# Ray +# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} +# WORKING_DIR=${WORKING_DIR:-"${PWD}"} +# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} +# Paths +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface +MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"} +CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} +TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} +TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} + +# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B +MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B +CKPTS_DIR=./ckpts/${project_name}/${exp_name} +# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet +# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet +TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet 
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet + +rollout_mode="async" +rollout_name="vllm" # sglang or vllm +if [ "$rollout_mode" = "async" ]; then + export VLLM_USE_V1=1 + return_raw_chat="True" +fi + +# Algorithm parameters +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +# Response length parameters +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 *8)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 + +# Training parameters +loss_agg_mode="token-mean" + +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + +# Performance Related Parameter +use_dynamic_bsz=True +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) +ref_offload=True +actor_offload=False +gen_tp=1 +sp_size=1 +fsdp_size=2 + +# Fully async specific parameters +NNODES=${NNODES:-2} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} + +n_gpus_rollout=6 +n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout)) + +train_prompt_bsz=0 +gen_prompt_bsz=1 +n_resp_per_prompt=16 +train_prompt_mini_bsz=32 +total_rollout_steps=$(((512*100))) +test_freq=10 +staleness_threshold=1 +trigger_parameter_sync_step=64 +partial_rollout=True + +PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python" +if [ ! 
-x "$PYTHON_INTERPRETER" ]; then + PYTHON_INTERPRETER="python3" +fi + +$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + data.gen_batch_size=${gen_prompt_bsz} \ + data.return_raw_chat=${return_raw_chat} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.strategy=fsdp2 \ + critic.strategy=fsdp2 \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.hybrid_engine=False \ + +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ + actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + 
actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.grad_clip=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \ + actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ + actor_rollout_ref.rollout.name=${rollout_name} \ + actor_rollout_ref.rollout.mode=${rollout_mode} \ + actor_rollout_ref.rollout.calculate_log_probs=True \ + reward_model.reward_manager=dapo \ + +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger=['console','tensorboard'] \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + 
trainer.val_before_train=True \ + trainer.save_freq=-1 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.resume_mode=auto \ + trainer.nnodes="${NNODES}" \ + trainer.n_gpus_per_node="${n_gpus_training}" \ + rollout.nnodes="${NNODES}" \ + rollout.n_gpus_per_node="${n_gpus_rollout}" \ + rollout.total_rollout_steps="${total_rollout_steps}" \ + rollout.total_epochs=10 \ + rollout.test_freq="${test_freq}" \ + async_training.staleness_threshold="${staleness_threshold}" \ + async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ + async_training.partial_rollout="${partial_rollout}" diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_8-24/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_8-24/runtime_env.yaml new file mode 100644 index 00000000000..dcca08e67f7 --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_8-24/runtime_env.yaml @@ -0,0 +1,5 @@ +env_vars: + VLLM_USE_V1: "1" + TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard/dapo_7b_math_fsdp2_4_12" + NCCL_DEBUG: "INFO" + HYDRA_FULL_ERROR: "1" \ No newline at end of file From 9199f56726ae78266bf536d728495c91bb46ff44 Mon Sep 17 00:00:00 2001 From: wangshulin02 <953550366@qq.com> Date: Fri, 5 Sep 2025 21:16:01 +0800 Subject: [PATCH 102/182] exp shell files qwen3-32B_32 megatron colocate --- .../dapo_7b_math_fsdp2_colocate.sh | 6 +- .../fsdp2_colocate/runtime_env.yaml | 3 + .../dapo_7b_math_fsdp2_colocate.sh | 133 +++++++++++++ .../fsdp2_colocate_64/runtime_env.yaml | 3 + ..._fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh | 0 .../fsdp2_fully-async_16-16}/runtime_env.yaml | 0 ..._fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh | 0 .../fsdp2_fully-async_24-8}/runtime_env.yaml | 0 ..._fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh | 0 .../fsdp2_fully-async_8-24}/runtime_env.yaml | 0 .../dapo_7b_math_megatron_colocate.sh | 135 ++++++++++++++ .../megatron_colocate/runtime_env.yaml | 
3 + .../dapo_7b_math_fsdp2_colocate.sh | 133 +++++++++++++ .../fsdp2_colocate/runtime_env.yaml | 3 + ..._fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh | 174 ++++++++++++++++++ .../fsdp2_fully-async_16-16}/runtime_env.yaml | 0 ..._fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh | 174 ++++++++++++++++++ .../fsdp2_fully-async_24-8/runtime_env.yaml | 5 + ..._fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh | 174 ++++++++++++++++++ .../fsdp2_fully-async_8-24/runtime_env.yaml | 5 + .../dapo_7b_math_megatron_colocate.sh | 135 ++++++++++++++ .../megatron_colocate/runtime_env.yaml | 3 + .../early_megatron_colocate.sh | 154 ++++++++++++++++ .../megatron_colocate/runtime_env.yaml | 5 + verl/trainer/ppo/ray_trainer.py | 6 +- 25 files changed, 1247 insertions(+), 7 deletions(-) rename recipe/fully_async_policy/exp/{qwen2-7B-math => qwen2-7B-math_32}/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh (94%) create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate/runtime_env.yaml create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate_64/dapo_7b_math_fsdp2_colocate.sh create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate_64/runtime_env.yaml rename recipe/fully_async_policy/exp/{qwen2-7B-math => qwen2-7B-math_32}/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh (100%) rename recipe/fully_async_policy/exp/{qwen2-7B-math/fsdp2_colocate => qwen2-7B-math_32/fsdp2_fully-async_16-16}/runtime_env.yaml (100%) rename recipe/fully_async_policy/exp/{qwen2-7B-math => qwen2-7B-math_32}/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh (100%) rename recipe/fully_async_policy/exp/{qwen2-7B-math/fsdp2_fully-async_16-16 => qwen2-7B-math_32/fsdp2_fully-async_24-8}/runtime_env.yaml (100%) rename recipe/fully_async_policy/exp/{qwen2-7B-math => qwen2-7B-math_32}/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh (100%) rename 
recipe/fully_async_policy/exp/{qwen2-7B-math/fsdp2_fully-async_24-8 => qwen2-7B-math_32/fsdp2_fully-async_8-24}/runtime_env.yaml (100%) create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/dapo_7b_math_megatron_colocate.sh create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/runtime_env.yaml create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_colocate/runtime_env.yaml create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh rename recipe/fully_async_policy/exp/{qwen2-7B-math/fsdp2_fully-async_8-24 => qwen2-7B-math_64/fsdp2_fully-async_16-16}/runtime_env.yaml (100%) create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-8/runtime_env.yaml create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_8-24/runtime_env.yaml create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/dapo_7b_math_megatron_colocate.sh create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/runtime_env.yaml create mode 100644 recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/early_megatron_colocate.sh create mode 100644 recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/runtime_env.yaml diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh 
b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh similarity index 94% rename from recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh rename to recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh index 8b627fd6eed..8d42dca04ca 100644 --- a/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh @@ -2,7 +2,7 @@ set -xeuo pipefail project_name='DAPO' -exp_name='dapo_qwen2-7B-math_28k_fsdp2_colocate_32_mbs32_tfq20' +exp_name='dapo_qwen2-7B-math_28k_fsdp2_colocate_32_mbs32' adv_estimator=grpo @@ -35,10 +35,6 @@ NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} # Paths RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} # very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface -MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"} -CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} -TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} -TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B CKPTS_DIR=./ckpts/${project_name}/${exp_name} diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate/runtime_env.yaml new file mode 100644 index 00000000000..39c5a3593e8 --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate/runtime_env.yaml @@ -0,0 +1,3 @@ +env_vars: + TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math/dapo_qwen2-7B-math_28k_fsdp2_colocate_32_mbs32" + HYDRA_FULL_ERROR: "1" \ No newline at end of file diff 
--git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate_64/dapo_7b_math_fsdp2_colocate.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate_64/dapo_7b_math_fsdp2_colocate.sh new file mode 100644 index 00000000000..8d42dca04ca --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate_64/dapo_7b_math_fsdp2_colocate.sh @@ -0,0 +1,133 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +project_name='DAPO' +exp_name='dapo_qwen2-7B-math_28k_fsdp2_colocate_32_mbs32' + +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 * 28)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 + +loss_agg_mode="token-mean" + +train_prompt_bsz=512 +n_resp_per_prompt=16 +train_prompt_mini_bsz=32 + +# Ray +# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} +# WORKING_DIR=${WORKING_DIR:-"${PWD}"} +# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} +NNODES=${NNODES:-4} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} +# Paths +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +# very important! 
please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface + +MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B +CKPTS_DIR=./ckpts/${project_name}/${exp_name} +TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet +TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + +# Performance Related Parameter +use_dynamic_bsz=True +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) +offload=True +gen_tp=4 +sp_size=4 +fsdp_size=2 + +# reference run wandb: https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/runs/ow47vvon?nw=nwusertongyuxuan361 + + +python -m verl.trainer.main_ppo \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.strategy=fsdp2 \ + critic.strategy=fsdp2 \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + actor_rollout_ref.model.use_remove_padding=True \ + +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ + 
actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.grad_clip=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \ + actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ + 
actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ + reward_model.reward_manager=dapo \ + +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger=['console','tensorboard'] \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ + trainer.nnodes="${NNODES}" \ + trainer.val_before_train=True \ + trainer.test_freq=20 \ + trainer.save_freq=-1 \ + trainer.total_epochs=10 \ + trainer.total_training_steps=400 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.resume_mode=auto \ + trainer.log_val_generations=10 diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate_64/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate_64/runtime_env.yaml new file mode 100644 index 00000000000..39c5a3593e8 --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate_64/runtime_env.yaml @@ -0,0 +1,3 @@ +env_vars: + TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math/dapo_qwen2-7B-math_28k_fsdp2_colocate_32_mbs32" + HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh similarity index 100% rename from recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh rename to 
recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/runtime_env.yaml similarity index 100% rename from recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_colocate/runtime_env.yaml rename to recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/runtime_env.yaml diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh similarity index 100% rename from recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh rename to recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_16-16/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/runtime_env.yaml similarity index 100% rename from recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_16-16/runtime_env.yaml rename to recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/runtime_env.yaml diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh similarity index 100% rename from recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh rename to 
recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_24-8/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/runtime_env.yaml similarity index 100% rename from recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_24-8/runtime_env.yaml rename to recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/runtime_env.yaml diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/dapo_7b_math_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/dapo_7b_math_megatron_colocate.sh new file mode 100644 index 00000000000..8bf1af32da8 --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/dapo_7b_math_megatron_colocate.sh @@ -0,0 +1,135 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +project_name='DAPO' +exp_name='dapo_qwen2-7B-math_28k_megatron_colocate_32_mbs32' + +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 * 28)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 + +loss_agg_mode="token-mean" + +train_prompt_bsz=512 +n_resp_per_prompt=16 +train_prompt_mini_bsz=32 + +# Ray +# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} +# WORKING_DIR=${WORKING_DIR:-"${PWD}"} +# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} +NNODES=${NNODES:-4} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} +# Paths +MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B +CKPTS_DIR=./ckpts/${project_name}/${exp_name} +TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet 
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet + +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + +# Performance Related Parameter +use_dynamic_bsz=True +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) +offload=True +gen_tp=4 +train_tp=2 +train_pp=1 + +# TODO: support dynamic_bsz for megatron +# actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ +# actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ +# actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ +# actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ +# actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ +# actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + +python3 -m verl.trainer.main_ppo \ + --config-path=config \ + --config-name='ppo_megatron_trainer.yaml' \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.strategy=megatron \ + critic.strategy=megatron \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ + 
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.megatron.param_offload=${offload} \ + actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \ + actor_rollout_ref.actor.megatron.grad_offload=${offload} \ + actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \ + actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.optim.clip_grad=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \ + actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \ + actor_rollout_ref.ref.megatron.param_offload=${offload} \ + reward_model.reward_manager=dapo \ + +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + 
+reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger=['console','tensorboard'] \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes="${NNODES}" \ + trainer.val_before_train=True \ + trainer.test_freq=20 \ + trainer.save_freq=-1 \ + trainer.total_epochs=10 \ + trainer.total_training_steps=400 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.resume_mode=auto \ + trainer.log_val_generations=10 diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/runtime_env.yaml new file mode 100644 index 00000000000..3a35b4a52ad --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/runtime_env.yaml @@ -0,0 +1,3 @@ +env_vars: + TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math/dapo_qwen2-7B-math_28k_megatron_colocate_32_mbs32" + HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh new file mode 100644 index 00000000000..e6ab551869d --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh @@ -0,0 +1,133 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +project_name='DAPO' +exp_name='dapo_qwen2-7B-math_28k_fsdp2_colocate_64_mbs32' + +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + 
+max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 * 28)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 + +loss_agg_mode="token-mean" + +train_prompt_bsz=512 +n_resp_per_prompt=16 +train_prompt_mini_bsz=32 + +# Ray +# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} +# WORKING_DIR=${WORKING_DIR:-"${PWD}"} +# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} +NNODES=${NNODES:-8} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} +# Paths +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface + +MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B +CKPTS_DIR=./ckpts/${project_name}/${exp_name} +TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet +TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + +# Performance Related Parameter +use_dynamic_bsz=True +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) +offload=True +gen_tp=4 +sp_size=4 +fsdp_size=2 + +# reference run wandb: https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/runs/ow47vvon?nw=nwusertongyuxuan361 + + +python -m verl.trainer.main_ppo \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + 
algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.strategy=fsdp2 \ + critic.strategy=fsdp2 \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + actor_rollout_ref.model.use_remove_padding=True \ + +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ + actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.grad_clip=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + 
actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \ + actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ + reward_model.reward_manager=dapo \ + +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger=['console','tensorboard'] \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ + trainer.nnodes="${NNODES}" \ + trainer.val_before_train=True \ + trainer.test_freq=20 \ + trainer.save_freq=-1 \ + trainer.total_epochs=10 \ + trainer.total_training_steps=400 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.resume_mode=auto \ + trainer.log_val_generations=10 diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_colocate/runtime_env.yaml new file mode 100644 index 00000000000..514ab9a73f0 --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_colocate/runtime_env.yaml @@ -0,0 +1,3 @@ +env_vars: + TENSORBOARD_DIR: 
"/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-64/dapo_qwen2-7B-math_28k_fsdp2_colocate_64_mbs32" + HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh new file mode 100644 index 00000000000..618497c0257 --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh @@ -0,0 +1,174 @@ +#!/usr/bin/env bash +set -xeuo pipefail +# dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20 +project_name='DAPO' +exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20' + +# Ray +# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} +# WORKING_DIR=${WORKING_DIR:-"${PWD}"} +# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} +# Paths +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +# very important! 
please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface +MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"} +CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} +TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} +TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} + +# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B +MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B +CKPTS_DIR=./ckpts/${project_name}/${exp_name} +# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet +# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet +TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet +TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet + +rollout_mode="async" +rollout_name="vllm" # sglang or vllm +if [ "$rollout_mode" = "async" ]; then + export VLLM_USE_V1=1 + return_raw_chat="True" +fi + +# Algorithm parameters +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +# Response length parameters +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 *8)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 + +# Training parameters +loss_agg_mode="token-mean" + +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + +# Performance Related Parameter +use_dynamic_bsz=True +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) +infer_ppo_max_token_len=$(((max_prompt_length + 
max_response_length) * 3)) +ref_offload=True +actor_offload=False +gen_tp=1 +sp_size=1 +fsdp_size=2 + +# Fully async specific parameters +NNODES=${NNODES:-2} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} + +n_gpus_rollout=6 +n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout)) + +train_prompt_bsz=0 +gen_prompt_bsz=1 +n_resp_per_prompt=16 +train_prompt_mini_bsz=32 +total_rollout_steps=$(((512*100))) +test_freq=10 +staleness_threshold=1 +trigger_parameter_sync_step=64 +partial_rollout=True + +PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python" +if [ ! -x "$PYTHON_INTERPRETER" ]; then + PYTHON_INTERPRETER="python3" +fi + +$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + data.gen_batch_size=${gen_prompt_bsz} \ + data.return_raw_chat=${return_raw_chat} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.strategy=fsdp2 \ + critic.strategy=fsdp2 \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.hybrid_engine=False \ + +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ + actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + 
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.grad_clip=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \ + actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ + actor_rollout_ref.rollout.name=${rollout_name} \ + actor_rollout_ref.rollout.mode=${rollout_mode} \ + actor_rollout_ref.rollout.calculate_log_probs=True \ + reward_model.reward_manager=dapo \ + 
+reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger=['console','tensorboard'] \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.val_before_train=True \ + trainer.save_freq=-1 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.resume_mode=auto \ + trainer.nnodes="${NNODES}" \ + trainer.n_gpus_per_node="${n_gpus_training}" \ + rollout.nnodes="${NNODES}" \ + rollout.n_gpus_per_node="${n_gpus_rollout}" \ + rollout.total_rollout_steps="${total_rollout_steps}" \ + rollout.total_epochs=10 \ + rollout.test_freq="${test_freq}" \ + async_training.staleness_threshold="${staleness_threshold}" \ + async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ + async_training.partial_rollout="${partial_rollout}" diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_8-24/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_16-16/runtime_env.yaml similarity index 100% rename from recipe/fully_async_policy/exp/qwen2-7B-math/fsdp2_fully-async_8-24/runtime_env.yaml rename to recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_16-16/runtime_env.yaml diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh new file mode 100644 index 00000000000..618497c0257 --- /dev/null +++ 
b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh @@ -0,0 +1,174 @@ +#!/usr/bin/env bash +set -xeuo pipefail +# dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20 +project_name='DAPO' +exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20' + +# Ray +# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} +# WORKING_DIR=${WORKING_DIR:-"${PWD}"} +# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} +# Paths +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface +MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"} +CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} +TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} +TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} + +# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B +MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B +CKPTS_DIR=./ckpts/${project_name}/${exp_name} +# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet +# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet +TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet +TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet + +rollout_mode="async" +rollout_name="vllm" # sglang or vllm +if [ "$rollout_mode" = "async" ]; then + export VLLM_USE_V1=1 + return_raw_chat="True" +fi + +# Algorithm parameters +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 
+use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +# Response length parameters +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 *8)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 + +# Training parameters +loss_agg_mode="token-mean" + +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + +# Performance Related Parameter +use_dynamic_bsz=True +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) +ref_offload=True +actor_offload=False +gen_tp=1 +sp_size=1 +fsdp_size=2 + +# Fully async specific parameters +NNODES=${NNODES:-2} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} + +n_gpus_rollout=6 +n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout)) + +train_prompt_bsz=0 +gen_prompt_bsz=1 +n_resp_per_prompt=16 +train_prompt_mini_bsz=32 +total_rollout_steps=$(((512*100))) +test_freq=10 +staleness_threshold=1 +trigger_parameter_sync_step=64 +partial_rollout=True + +PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python" +if [ ! 
-x "$PYTHON_INTERPRETER" ]; then + PYTHON_INTERPRETER="python3" +fi + +$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + data.gen_batch_size=${gen_prompt_bsz} \ + data.return_raw_chat=${return_raw_chat} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.strategy=fsdp2 \ + critic.strategy=fsdp2 \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.hybrid_engine=False \ + +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ + actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + 
actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.grad_clip=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \ + actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ + actor_rollout_ref.rollout.name=${rollout_name} \ + actor_rollout_ref.rollout.mode=${rollout_mode} \ + actor_rollout_ref.rollout.calculate_log_probs=True \ + reward_model.reward_manager=dapo \ + +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger=['console','tensorboard'] \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + 
trainer.val_before_train=True \ + trainer.save_freq=-1 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.resume_mode=auto \ + trainer.nnodes="${NNODES}" \ + trainer.n_gpus_per_node="${n_gpus_training}" \ + rollout.nnodes="${NNODES}" \ + rollout.n_gpus_per_node="${n_gpus_rollout}" \ + rollout.total_rollout_steps="${total_rollout_steps}" \ + rollout.total_epochs=10 \ + rollout.test_freq="${test_freq}" \ + async_training.staleness_threshold="${staleness_threshold}" \ + async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ + async_training.partial_rollout="${partial_rollout}" diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-8/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-8/runtime_env.yaml new file mode 100644 index 00000000000..dcca08e67f7 --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-8/runtime_env.yaml @@ -0,0 +1,5 @@ +env_vars: + VLLM_USE_V1: "1" + TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard/dapo_7b_math_fsdp2_4_12" + NCCL_DEBUG: "INFO" + HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh new file mode 100644 index 00000000000..618497c0257 --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh @@ -0,0 +1,174 @@ +#!/usr/bin/env bash +set -xeuo pipefail +# dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20 +project_name='DAPO' +exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20' + +# Ray +# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} +# 
WORKING_DIR=${WORKING_DIR:-"${PWD}"} +# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} +# Paths +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface +MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"} +CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} +TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} +TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} + +# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B +MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B +CKPTS_DIR=./ckpts/${project_name}/${exp_name} +# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet +# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet +TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet +TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet + +rollout_mode="async" +rollout_name="vllm" # sglang or vllm +if [ "$rollout_mode" = "async" ]; then + export VLLM_USE_V1=1 + return_raw_chat="True" +fi + +# Algorithm parameters +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +# Response length parameters +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 *8)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 + +# Training parameters +loss_agg_mode="token-mean" + +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + 
+# Performance Related Parameter +use_dynamic_bsz=True +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) +ref_offload=True +actor_offload=False +gen_tp=1 +sp_size=1 +fsdp_size=2 + +# Fully async specific parameters +NNODES=${NNODES:-2} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} + +n_gpus_rollout=6 +n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout)) + +train_prompt_bsz=0 +gen_prompt_bsz=1 +n_resp_per_prompt=16 +train_prompt_mini_bsz=32 +total_rollout_steps=$(((512*100))) +test_freq=10 +staleness_threshold=1 +trigger_parameter_sync_step=64 +partial_rollout=True + +PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python" +if [ ! -x "$PYTHON_INTERPRETER" ]; then + PYTHON_INTERPRETER="python3" +fi + +$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + data.gen_batch_size=${gen_prompt_bsz} \ + data.return_raw_chat=${return_raw_chat} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.strategy=fsdp2 \ + critic.strategy=fsdp2 \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.hybrid_engine=False \ + +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ + 
actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.grad_clip=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \ + actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ + 
actor_rollout_ref.rollout.name=${rollout_name} \ + actor_rollout_ref.rollout.mode=${rollout_mode} \ + actor_rollout_ref.rollout.calculate_log_probs=True \ + reward_model.reward_manager=dapo \ + +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger=['console','tensorboard'] \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.val_before_train=True \ + trainer.save_freq=-1 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.resume_mode=auto \ + trainer.nnodes="${NNODES}" \ + trainer.n_gpus_per_node="${n_gpus_training}" \ + rollout.nnodes="${NNODES}" \ + rollout.n_gpus_per_node="${n_gpus_rollout}" \ + rollout.total_rollout_steps="${total_rollout_steps}" \ + rollout.total_epochs=10 \ + rollout.test_freq="${test_freq}" \ + async_training.staleness_threshold="${staleness_threshold}" \ + async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ + async_training.partial_rollout="${partial_rollout}" diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_8-24/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_8-24/runtime_env.yaml new file mode 100644 index 00000000000..dcca08e67f7 --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_8-24/runtime_env.yaml @@ -0,0 +1,5 @@ +env_vars: + VLLM_USE_V1: "1" + TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard/dapo_7b_math_fsdp2_4_12" + NCCL_DEBUG: "INFO" + HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git 
a/recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/dapo_7b_math_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/dapo_7b_math_megatron_colocate.sh new file mode 100644 index 00000000000..7444ec90c99 --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/dapo_7b_math_megatron_colocate.sh @@ -0,0 +1,135 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +project_name='DAPO' +exp_name='dapo_qwen2-7B-math_28k_megatron_colocate_64_mbs32' + +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 * 28)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 + +loss_agg_mode="token-mean" + +train_prompt_bsz=512 +n_resp_per_prompt=16 +train_prompt_mini_bsz=32 + +# Ray +# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} +# WORKING_DIR=${WORKING_DIR:-"${PWD}"} +# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} +NNODES=${NNODES:-8} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} +# Paths +MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B +CKPTS_DIR=./ckpts/${project_name}/${exp_name} +TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet +TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet + +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + +# Performance Related Parameter +use_dynamic_bsz=True +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) +offload=True +gen_tp=4 +train_tp=2 +train_pp=1 + +# TODO: support dynamic_bsz for megatron +# 
actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ +# actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ +# actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ +# actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ +# actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ +# actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + +python3 -m verl.trainer.main_ppo \ + --config-path=config \ + --config-name='ppo_megatron_trainer.yaml' \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.strategy=megatron \ + critic.strategy=megatron \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.megatron.param_offload=${offload} \ + 
actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \ + actor_rollout_ref.actor.megatron.grad_offload=${offload} \ + actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \ + actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.optim.clip_grad=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \ + actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \ + actor_rollout_ref.ref.megatron.param_offload=${offload} \ + reward_model.reward_manager=dapo \ + +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger=['console','tensorboard'] \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes="${NNODES}" \ + 
trainer.val_before_train=True \ + trainer.test_freq=20 \ + trainer.save_freq=-1 \ + trainer.total_epochs=10 \ + trainer.total_training_steps=400 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.resume_mode=auto \ + trainer.log_val_generations=10 diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/runtime_env.yaml new file mode 100644 index 00000000000..a8cd045e180 --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/runtime_env.yaml @@ -0,0 +1,3 @@ +env_vars: + TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-64/dapo_qwen2-7B-math_28k_megatron_colocate_64_mbs32" + HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/early_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/early_megatron_colocate.sh new file mode 100644 index 00000000000..55e8733a9fb --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/early_megatron_colocate.sh @@ -0,0 +1,154 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +project_name='DAPO' +exp_name='dapo_qwen3-32B_32k_megatron_colocate_32_mbs32' + +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 * 32)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 + +loss_agg_mode="token-mean" + +train_prompt_bsz=512 +train_prompt_mini_bsz=32 +n_resp_per_prompt=16 + +NNODES=${NNODES:-4} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} +# Paths +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/basemodel/Qwen/Qwen3-32B 
+CKPTS_DIR=./ckpts/${project_name}/${exp_name} +TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet +TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet + +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + + +# Performance Related Parameter +use_dynamic_bsz=True +actor_ppo_max_token_len=$((max_prompt_length + max_response_length)) +infer_ppo_max_token_len=$((max_prompt_length + max_response_length)) +offload=True +gen_tp=4 +train_tp=4 +train_pp=2 +EP=1 +ETP=1 +CP=1 + +python3 -m verl.trainer.main_ppo \ + --config-path=config \ + --config-name='ppo_megatron_trainer.yaml' \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + 
actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.optim.clip_grad=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + reward_model.reward_manager=dapo \ + +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger='["console","tensorboard"]' \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ + trainer.nnodes="${NNODES}" \ + trainer.val_before_train=True \ + trainer.test_freq=20 \ + trainer.save_freq=-1 \ + trainer.total_epochs=10 \ + trainer.total_training_steps=400 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.resume_mode=auto \ + trainer.log_val_generations=10 \ + actor_rollout_ref.actor.megatron.param_offload=${offload} \ + actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \ + actor_rollout_ref.actor.megatron.grad_offload=${offload} \ + 
actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \ + actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \ + actor_rollout_ref.actor.megatron.expert_model_parallel_size=${EP} \ + actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${ETP} \ + actor_rollout_ref.actor.megatron.context_parallel_size=${CP} \ + +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=True \ + +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=selective \ + +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","layernorm","mlp"] \ + +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \ + +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \ + +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \ + actor_rollout_ref.ref.megatron.param_offload=${offload} \ + actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \ + actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \ + actor_rollout_ref.ref.megatron.expert_model_parallel_size=${EP} \ + actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${ETP} \ + actor_rollout_ref.ref.megatron.context_parallel_size=${CP} \ + actor_rollout_ref.actor.megatron.use_mbridge=True + + # +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=False \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_shared_expert_overlap=False \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=False \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=alltoall \ + # 
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \ + # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity="selective" \ + # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","moe_act","layernorm","mlp","moe"] \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage=${last_layer} \ \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/runtime_env.yaml new file mode 100644 index 00000000000..0d5684b1c73 --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/runtime_env.yaml @@ -0,0 +1,5 @@ +env_vars: + TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen3-32B-32/dapo_qwen3-32B_32k_fsdp2_colocate_32_mbs32" + HYDRA_FULL_ERROR: "1" + TORCH_NCCL_AVOID_RECORD_STREAMS: "1" + CUDA_DEVICE_MAX_CONNECTIONS: "1" \ No newline at end of file diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py index 42c728fa79d..2d5a0538616 100644 --- a/verl/trainer/ppo/ray_trainer.py +++ b/verl/trainer/ppo/ray_trainer.py @@ -1247,9 +1247,11 @@ def _process_batch_common(self, batch, metrics, timing_raw): reward_tensor, 
reward_extra_infos_dict = compute_reward(batch, self.reward_fn) # recompute old_log_probs with marked_timer("old_log_prob", timing_raw, color="blue"): - if self.config.async_training and self.config.async_training.use_rollout_log_probs: + async_training = self.config.get("async_training", None) + if async_training and async_training.use_rollout_log_prob: batch.batch["old_log_probs"] = batch.batch["rollout_log_probs"] - del actor_old_log_probs + batch.meta_info["temperature"] = self.config.actor_rollout_ref.rollout.temperature + else: old_log_prob = self.actor_rollout_wg.compute_log_prob(batch) From 363d12d1a0517029d380c3d0001dfe1b3c2f5bee Mon Sep 17 00:00:00 2001 From: wangshulin02 <953550366@qq.com> Date: Fri, 5 Sep 2025 22:11:01 +0800 Subject: [PATCH 103/182] exp shell file colocate done --- .../early_megatron_colocate.sh | 159 ++++++++++++++++++ .../megatron_colocate/runtime_env.yaml | 5 + .../early_megatron_colocate.sh | 159 ++++++++++++++++++ .../megatron_colocate/runtime_env.yaml | 5 + .../early_megatron_colocate.sh | 159 ++++++++++++++++++ .../megatron_colocate/runtime_env.yaml | 5 + .../early_megatron_colocate.sh | 154 +++++++++++++++++ .../megatron_colocate/runtime_env.yaml | 5 + .../megatron_colocate/runtime_env.yaml | 2 +- .../early_megatron_colocate.sh | 154 +++++++++++++++++ .../megatron_colocate/runtime_env.yaml | 5 + 11 files changed, 811 insertions(+), 1 deletion(-) create mode 100644 recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/early_megatron_colocate.sh create mode 100644 recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/runtime_env.yaml create mode 100644 recipe/fully_async_policy/exp/qwen3-30BA3B_32/megatron_colocate/early_megatron_colocate.sh create mode 100644 recipe/fully_async_policy/exp/qwen3-30BA3B_32/megatron_colocate/runtime_env.yaml create mode 100644 recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/early_megatron_colocate.sh create mode 100644 
recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/runtime_env.yaml create mode 100644 recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/early_megatron_colocate.sh create mode 100644 recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/runtime_env.yaml create mode 100644 recipe/fully_async_policy/exp/qwen3-32B_64/megatron_colocate/early_megatron_colocate.sh create mode 100644 recipe/fully_async_policy/exp/qwen3-32B_64/megatron_colocate/runtime_env.yaml diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/early_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/early_megatron_colocate.sh new file mode 100644 index 00000000000..26507694635 --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/early_megatron_colocate.sh @@ -0,0 +1,159 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +project_name='DAPO' +exp_name='dapo_qwen3-30BA3B_32k_megatron_colocate_128_mbs32' + +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 * 32)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 + +loss_agg_mode="token-mean" + +train_prompt_bsz=512 +train_prompt_mini_bsz=32 +n_resp_per_prompt=16 + +NNODES=${NNODES:-16} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} +# Paths +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/basemodel/Qwen/Qwen3-30B-A3B +CKPTS_DIR=./ckpts/${project_name}/${exp_name} +TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet +TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet + +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM 
rollout +val_top_p=0.7 + + +# Performance Related Parameter +use_dynamic_bsz=True +actor_ppo_max_token_len=$((max_prompt_length + max_response_length)) +infer_ppo_max_token_len=$((max_prompt_length + max_response_length)) +offload=True +gen_tp=4 +train_tp=1 +train_pp=1 +EP=8 +ETP=1 +CP=1 + +python3 -m verl.trainer.main_ppo \ + --config-path=config \ + --config-name='ppo_megatron_trainer.yaml' \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.optim.clip_grad=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + 
actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + reward_model.reward_manager=dapo \ + +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger='["console","tensorboard"]' \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ + trainer.nnodes="${NNODES}" \ + trainer.val_before_train=True \ + trainer.test_freq=20 \ + trainer.save_freq=-1 \ + trainer.total_epochs=10 \ + trainer.total_training_steps=400 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.resume_mode=auto \ + trainer.log_val_generations=10 \ + actor_rollout_ref.actor.megatron.param_offload=${offload} \ + actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \ + actor_rollout_ref.actor.megatron.grad_offload=${offload} \ + actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \ + actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \ + actor_rollout_ref.actor.megatron.expert_model_parallel_size=${EP} \ + actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${ETP} \ + actor_rollout_ref.actor.megatron.context_parallel_size=${CP} \ + 
+actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=True \ + +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \ + +actor_rollout_ref.actor.megatron.override_transformer_config.moe_shared_expert_overlap=False \ + +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=True \ + +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=flex \ + +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \ + +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=selective \ + +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","moe_act","layernorm","mlp","moe"] \ + +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \ + +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \ + +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \ + actor_rollout_ref.ref.megatron.param_offload=${offload} \ + actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \ + actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \ + actor_rollout_ref.ref.megatron.expert_model_parallel_size=${EP} \ + actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${ETP} \ + actor_rollout_ref.ref.megatron.context_parallel_size=${CP} \ + actor_rollout_ref.actor.megatron.use_mbridge=True + + # +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=False \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_shared_expert_overlap=False \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=False \ + # 
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=alltoall \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \ + # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity="selective" \ + # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","moe_act","layernorm","mlp","moe"] \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage=${last_layer} \ \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/runtime_env.yaml new file mode 100644 index 00000000000..4a714f40f43 --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/runtime_env.yaml @@ -0,0 +1,5 @@ +env_vars: + TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen3-30BA3B-128/dapo_qwen3-30BA3B_32k_megatron_colocate_128_mbs32" + HYDRA_FULL_ERROR: "1" + TORCH_NCCL_AVOID_RECORD_STREAMS: "1" + CUDA_DEVICE_MAX_CONNECTIONS: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_32/megatron_colocate/early_megatron_colocate.sh 
b/recipe/fully_async_policy/exp/qwen3-30BA3B_32/megatron_colocate/early_megatron_colocate.sh new file mode 100644 index 00000000000..69e5a723e9b --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen3-30BA3B_32/megatron_colocate/early_megatron_colocate.sh @@ -0,0 +1,159 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +project_name='DAPO' +exp_name='dapo_qwen3-30BA3B_32k_megatron_colocate_32_mbs32' + +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 * 32)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 + +loss_agg_mode="token-mean" + +train_prompt_bsz=512 +train_prompt_mini_bsz=32 +n_resp_per_prompt=16 + +NNODES=${NNODES:-4} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} +# Paths +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/basemodel/Qwen/Qwen3-30B-A3B +CKPTS_DIR=./ckpts/${project_name}/${exp_name} +TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet +TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet + +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + + +# Performance Related Parameter +use_dynamic_bsz=True +actor_ppo_max_token_len=$((max_prompt_length + max_response_length)) +infer_ppo_max_token_len=$((max_prompt_length + max_response_length)) +offload=True +gen_tp=4 +train_tp=1 +train_pp=1 +EP=8 +ETP=1 +CP=1 + +python3 -m verl.trainer.main_ppo \ + --config-path=config \ + --config-name='ppo_megatron_trainer.yaml' \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + 
data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.optim.clip_grad=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + reward_model.reward_manager=dapo \ + 
+reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger='["console","tensorboard"]' \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ + trainer.nnodes="${NNODES}" \ + trainer.val_before_train=True \ + trainer.test_freq=20 \ + trainer.save_freq=-1 \ + trainer.total_epochs=10 \ + trainer.total_training_steps=400 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.resume_mode=auto \ + trainer.log_val_generations=10 \ + actor_rollout_ref.actor.megatron.param_offload=${offload} \ + actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \ + actor_rollout_ref.actor.megatron.grad_offload=${offload} \ + actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \ + actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \ + actor_rollout_ref.actor.megatron.expert_model_parallel_size=${EP} \ + actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${ETP} \ + actor_rollout_ref.actor.megatron.context_parallel_size=${CP} \ + +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=True \ + +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \ + +actor_rollout_ref.actor.megatron.override_transformer_config.moe_shared_expert_overlap=False \ + +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=True \ + +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=flex \ + +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \ + 
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=selective \ + +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","moe_act","layernorm","mlp","moe"] \ + +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \ + +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \ + +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \ + actor_rollout_ref.ref.megatron.param_offload=${offload} \ + actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \ + actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \ + actor_rollout_ref.ref.megatron.expert_model_parallel_size=${EP} \ + actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${ETP} \ + actor_rollout_ref.ref.megatron.context_parallel_size=${CP} \ + actor_rollout_ref.actor.megatron.use_mbridge=True + + # +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=False \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_shared_expert_overlap=False \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=False \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=alltoall \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \ + # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity="selective" \ + # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","moe_act","layernorm","mlp","moe"] \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 
\ + # +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage=${last_layer} \ \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_32/megatron_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen3-30BA3B_32/megatron_colocate/runtime_env.yaml new file mode 100644 index 00000000000..052557120ad --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen3-30BA3B_32/megatron_colocate/runtime_env.yaml @@ -0,0 +1,5 @@ +env_vars: + TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen3-30BA3B-32/dapo_qwen3-30BA3B_32k_megatron_colocate_32_mbs32" + HYDRA_FULL_ERROR: "1" + TORCH_NCCL_AVOID_RECORD_STREAMS: "1" + CUDA_DEVICE_MAX_CONNECTIONS: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/early_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/early_megatron_colocate.sh new file mode 100644 index 00000000000..8e632a9dbfb --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/early_megatron_colocate.sh @@ -0,0 +1,159 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +project_name='DAPO' +exp_name='dapo_qwen3-30BA3B_32k_megatron_colocate_64_mbs32' + +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 * 32)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) 
+overlong_penalty_factor=1.0 + +loss_agg_mode="token-mean" + +train_prompt_bsz=512 +train_prompt_mini_bsz=32 +n_resp_per_prompt=16 + +NNODES=${NNODES:-8} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} +# Paths +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/basemodel/Qwen/Qwen3-30B-A3B +CKPTS_DIR=./ckpts/${project_name}/${exp_name} +TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet +TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet + +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + + +# Performance Related Parameter +use_dynamic_bsz=True +actor_ppo_max_token_len=$((max_prompt_length + max_response_length)) +infer_ppo_max_token_len=$((max_prompt_length + max_response_length)) +offload=True +gen_tp=4 +train_tp=1 +train_pp=1 +EP=8 +ETP=1 +CP=1 + +python3 -m verl.trainer.main_ppo \ + --config-path=config \ + --config-name='ppo_megatron_trainer.yaml' \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ + 
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.optim.clip_grad=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + reward_model.reward_manager=dapo \ + +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger='["console","tensorboard"]' \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ + trainer.nnodes="${NNODES}" \ + trainer.val_before_train=True \ + trainer.test_freq=20 \ + trainer.save_freq=-1 \ + trainer.total_epochs=10 \ + trainer.total_training_steps=400 \ + 
trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.resume_mode=auto \ + trainer.log_val_generations=10 \ + actor_rollout_ref.actor.megatron.param_offload=${offload} \ + actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \ + actor_rollout_ref.actor.megatron.grad_offload=${offload} \ + actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \ + actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \ + actor_rollout_ref.actor.megatron.expert_model_parallel_size=${EP} \ + actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${ETP} \ + actor_rollout_ref.actor.megatron.context_parallel_size=${CP} \ + +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=True \ + +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \ + +actor_rollout_ref.actor.megatron.override_transformer_config.moe_shared_expert_overlap=False \ + +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=True \ + +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=flex \ + +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \ + +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=selective \ + +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","moe_act","layernorm","mlp","moe"] \ + +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \ + +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \ + +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \ + actor_rollout_ref.ref.megatron.param_offload=${offload} \ + actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \ + actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \ + 
actor_rollout_ref.ref.megatron.expert_model_parallel_size=${EP} \ + actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${ETP} \ + actor_rollout_ref.ref.megatron.context_parallel_size=${CP} \ + actor_rollout_ref.actor.megatron.use_mbridge=True + + # +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=False \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_shared_expert_overlap=False \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=False \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=alltoall \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \ + # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity="selective" \ + # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","moe_act","layernorm","mlp","moe"] \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage=${last_layer} \ \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/runtime_env.yaml new file mode 
100644 index 00000000000..3a497e90dd0 --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/runtime_env.yaml @@ -0,0 +1,5 @@ +env_vars: + TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen3-30BA3B-64/dapo_qwen3-30BA3B_32k_megatron_colocate_64_mbs32" + HYDRA_FULL_ERROR: "1" + TORCH_NCCL_AVOID_RECORD_STREAMS: "1" + CUDA_DEVICE_MAX_CONNECTIONS: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/early_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/early_megatron_colocate.sh new file mode 100644 index 00000000000..3b9ce953d85 --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/early_megatron_colocate.sh @@ -0,0 +1,154 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +project_name='DAPO' +exp_name='dapo_qwen3-32B_32k_megatron_colocate_128_mbs32' + +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 * 32)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 + +loss_agg_mode="token-mean" + +train_prompt_bsz=512 +train_prompt_mini_bsz=32 +n_resp_per_prompt=16 + +NNODES=${NNODES:-16} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} +# Paths +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/basemodel/Qwen/Qwen3-32B +CKPTS_DIR=./ckpts/${project_name}/${exp_name} +TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet +TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet + +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + + +# Performance 
Related Parameter +use_dynamic_bsz=True +actor_ppo_max_token_len=$((max_prompt_length + max_response_length)) +infer_ppo_max_token_len=$((max_prompt_length + max_response_length)) +offload=True +gen_tp=4 +train_tp=4 +train_pp=2 +EP=1 +ETP=1 +CP=1 + +python3 -m verl.trainer.main_ppo \ + --config-path=config \ + --config-name='ppo_megatron_trainer.yaml' \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.optim.clip_grad=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + 
actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + reward_model.reward_manager=dapo \ + +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger='["console","tensorboard"]' \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ + trainer.nnodes="${NNODES}" \ + trainer.val_before_train=True \ + trainer.test_freq=20 \ + trainer.save_freq=-1 \ + trainer.total_epochs=10 \ + trainer.total_training_steps=400 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.resume_mode=auto \ + trainer.log_val_generations=10 \ + actor_rollout_ref.actor.megatron.param_offload=${offload} \ + actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \ + actor_rollout_ref.actor.megatron.grad_offload=${offload} \ + actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \ + actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \ + actor_rollout_ref.actor.megatron.expert_model_parallel_size=${EP} \ + actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${ETP} \ + actor_rollout_ref.actor.megatron.context_parallel_size=${CP} \ + +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=True \ + 
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=selective \ + +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","layernorm","mlp"] \ + +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \ + +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \ + +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \ + actor_rollout_ref.ref.megatron.param_offload=${offload} \ + actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \ + actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \ + actor_rollout_ref.ref.megatron.expert_model_parallel_size=${EP} \ + actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${ETP} \ + actor_rollout_ref.ref.megatron.context_parallel_size=${CP} \ + actor_rollout_ref.actor.megatron.use_mbridge=True + + # +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=False \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_shared_expert_overlap=False \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=False \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=alltoall \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \ + # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity="selective" \ + # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","moe_act","layernorm","mlp","moe"] \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \ + # 
+actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage=${last_layer} \ \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/runtime_env.yaml new file mode 100644 index 00000000000..1bbc3faadc9 --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/runtime_env.yaml @@ -0,0 +1,5 @@ +env_vars: + TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen3-32B-128/dapo_qwen3-32B_32k_megatron_colocate_128_mbs32" + HYDRA_FULL_ERROR: "1" + TORCH_NCCL_AVOID_RECORD_STREAMS: "1" + CUDA_DEVICE_MAX_CONNECTIONS: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/runtime_env.yaml index 0d5684b1c73..2d0930d13ab 100644 --- a/recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/runtime_env.yaml +++ b/recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/runtime_env.yaml @@ -1,5 +1,5 @@ env_vars: - TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen3-32B-32/dapo_qwen3-32B_32k_fsdp2_colocate_32_mbs32" + TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen3-32B-32/dapo_qwen3-32B_32k_megatron_colocate_32_mbs32" HYDRA_FULL_ERROR: "1" TORCH_NCCL_AVOID_RECORD_STREAMS: "1" 
CUDA_DEVICE_MAX_CONNECTIONS: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen3-32B_64/megatron_colocate/early_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen3-32B_64/megatron_colocate/early_megatron_colocate.sh new file mode 100644 index 00000000000..280d3e19dbf --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen3-32B_64/megatron_colocate/early_megatron_colocate.sh @@ -0,0 +1,154 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +project_name='DAPO' +exp_name='dapo_qwen3-32B_32k_megatron_colocate_64_mbs32' + +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 * 32)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 + +loss_agg_mode="token-mean" + +train_prompt_bsz=512 +train_prompt_mini_bsz=32 +n_resp_per_prompt=16 + +NNODES=${NNODES:-8} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} +# Paths +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/basemodel/Qwen/Qwen3-32B +CKPTS_DIR=./ckpts/${project_name}/${exp_name} +TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet +TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet + +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + + +# Performance Related Parameter +use_dynamic_bsz=True +actor_ppo_max_token_len=$((max_prompt_length + max_response_length)) +infer_ppo_max_token_len=$((max_prompt_length + max_response_length)) +offload=True +gen_tp=4 +train_tp=4 +train_pp=2 +EP=1 +ETP=1 +CP=1 + +python3 -m verl.trainer.main_ppo \ + --config-path=config \ + --config-name='ppo_megatron_trainer.yaml' \ + data.train_files="${TRAIN_FILE}" \ + 
data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.optim.clip_grad=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + 
actor_rollout_ref.rollout.val_kwargs.n=1 \ + reward_model.reward_manager=dapo \ + +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger='["console","tensorboard"]' \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ + trainer.nnodes="${NNODES}" \ + trainer.val_before_train=True \ + trainer.test_freq=20 \ + trainer.save_freq=-1 \ + trainer.total_epochs=10 \ + trainer.total_training_steps=400 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.resume_mode=auto \ + trainer.log_val_generations=10 \ + actor_rollout_ref.actor.megatron.param_offload=${offload} \ + actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \ + actor_rollout_ref.actor.megatron.grad_offload=${offload} \ + actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \ + actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \ + actor_rollout_ref.actor.megatron.expert_model_parallel_size=${EP} \ + actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${ETP} \ + actor_rollout_ref.actor.megatron.context_parallel_size=${CP} \ + +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=True \ + +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=selective \ + +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","layernorm","mlp"] \ + +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \ + +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False 
\ + +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \ + actor_rollout_ref.ref.megatron.param_offload=${offload} \ + actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \ + actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \ + actor_rollout_ref.ref.megatron.expert_model_parallel_size=${EP} \ + actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${ETP} \ + actor_rollout_ref.ref.megatron.context_parallel_size=${CP} \ + actor_rollout_ref.actor.megatron.use_mbridge=True + + # +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=False \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_shared_expert_overlap=False \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=False \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=alltoall \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \ + # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity="selective" \ + # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","moe_act","layernorm","mlp","moe"] \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \ + # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \ + # 
+actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage=${last_layer} \ \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen3-32B_64/megatron_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen3-32B_64/megatron_colocate/runtime_env.yaml new file mode 100644 index 00000000000..d3dc7176f0a --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen3-32B_64/megatron_colocate/runtime_env.yaml @@ -0,0 +1,5 @@ +env_vars: + TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen3-32B-64/dapo_qwen3-32B_32k_megatron_colocate_64_mbs32" + HYDRA_FULL_ERROR: "1" + TORCH_NCCL_AVOID_RECORD_STREAMS: "1" + CUDA_DEVICE_MAX_CONNECTIONS: "1" \ No newline at end of file From b0586059f38495a7b81b9cb35a8ee86dfcf9c316 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Sat, 6 Sep 2025 01:28:38 +0800 Subject: [PATCH 104/182] megatron fix --- .../megatron_colocate/early_megatron_colocate.sh | 2 ++ .../megatron_colocate/early_megatron_colocate.sh | 2 ++ .../megatron_colocate/early_megatron_colocate.sh | 2 ++ .../qwen3-32B_128/megatron_colocate/early_megatron_colocate.sh | 2 ++ .../qwen3-32B_32/megatron_colocate/early_megatron_colocate.sh | 2 ++ .../qwen3-32B_64/megatron_colocate/early_megatron_colocate.sh | 2 ++ recipe/r1/tasks/DocQA.py | 0 7 files changed, 12 insertions(+) create mode 100644 recipe/r1/tasks/DocQA.py diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/early_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/early_megatron_colocate.sh index 26507694635..c666034ffc3 100644 --- a/recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/early_megatron_colocate.sh +++ b/recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/early_megatron_colocate.sh @@ -115,6 +115,8 @@ python3 -m verl.trainer.main_ppo \ trainer.default_local_dir="${CKPTS_DIR}" \ trainer.resume_mode=auto \ 
trainer.log_val_generations=10 \ + critic.strategy=megatron \ + actor_rollout_ref.actor.strategy=megatron \ actor_rollout_ref.actor.megatron.param_offload=${offload} \ actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \ actor_rollout_ref.actor.megatron.grad_offload=${offload} \ diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_32/megatron_colocate/early_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen3-30BA3B_32/megatron_colocate/early_megatron_colocate.sh index 69e5a723e9b..b2d735f8704 100644 --- a/recipe/fully_async_policy/exp/qwen3-30BA3B_32/megatron_colocate/early_megatron_colocate.sh +++ b/recipe/fully_async_policy/exp/qwen3-30BA3B_32/megatron_colocate/early_megatron_colocate.sh @@ -115,6 +115,8 @@ python3 -m verl.trainer.main_ppo \ trainer.default_local_dir="${CKPTS_DIR}" \ trainer.resume_mode=auto \ trainer.log_val_generations=10 \ + critic.strategy=megatron \ + actor_rollout_ref.actor.strategy=megatron \ actor_rollout_ref.actor.megatron.param_offload=${offload} \ actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \ actor_rollout_ref.actor.megatron.grad_offload=${offload} \ diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/early_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/early_megatron_colocate.sh index 8e632a9dbfb..336d105cc5c 100644 --- a/recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/early_megatron_colocate.sh +++ b/recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/early_megatron_colocate.sh @@ -115,6 +115,8 @@ python3 -m verl.trainer.main_ppo \ trainer.default_local_dir="${CKPTS_DIR}" \ trainer.resume_mode=auto \ trainer.log_val_generations=10 \ + critic.strategy=megatron \ + actor_rollout_ref.actor.strategy=megatron \ actor_rollout_ref.actor.megatron.param_offload=${offload} \ actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \ actor_rollout_ref.actor.megatron.grad_offload=${offload} \ diff --git 
a/recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/early_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/early_megatron_colocate.sh index 3b9ce953d85..a7535e3575d 100644 --- a/recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/early_megatron_colocate.sh +++ b/recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/early_megatron_colocate.sh @@ -115,6 +115,8 @@ python3 -m verl.trainer.main_ppo \ trainer.default_local_dir="${CKPTS_DIR}" \ trainer.resume_mode=auto \ trainer.log_val_generations=10 \ + critic.strategy=megatron \ + actor_rollout_ref.actor.strategy=megatron \ actor_rollout_ref.actor.megatron.param_offload=${offload} \ actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \ actor_rollout_ref.actor.megatron.grad_offload=${offload} \ diff --git a/recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/early_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/early_megatron_colocate.sh index 55e8733a9fb..085c7231c59 100644 --- a/recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/early_megatron_colocate.sh +++ b/recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/early_megatron_colocate.sh @@ -115,6 +115,8 @@ python3 -m verl.trainer.main_ppo \ trainer.default_local_dir="${CKPTS_DIR}" \ trainer.resume_mode=auto \ trainer.log_val_generations=10 \ + critic.strategy=megatron \ + actor_rollout_ref.actor.strategy=megatron \ actor_rollout_ref.actor.megatron.param_offload=${offload} \ actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \ actor_rollout_ref.actor.megatron.grad_offload=${offload} \ diff --git a/recipe/fully_async_policy/exp/qwen3-32B_64/megatron_colocate/early_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen3-32B_64/megatron_colocate/early_megatron_colocate.sh index 280d3e19dbf..145ea3dbec9 100644 --- a/recipe/fully_async_policy/exp/qwen3-32B_64/megatron_colocate/early_megatron_colocate.sh +++ 
b/recipe/fully_async_policy/exp/qwen3-32B_64/megatron_colocate/early_megatron_colocate.sh @@ -115,6 +115,8 @@ python3 -m verl.trainer.main_ppo \ trainer.default_local_dir="${CKPTS_DIR}" \ trainer.resume_mode=auto \ trainer.log_val_generations=10 \ + critic.strategy=megatron \ + actor_rollout_ref.actor.strategy=megatron \ actor_rollout_ref.actor.megatron.param_offload=${offload} \ actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \ actor_rollout_ref.actor.megatron.grad_offload=${offload} \ diff --git a/recipe/r1/tasks/DocQA.py b/recipe/r1/tasks/DocQA.py new file mode 100644 index 00000000000..e69de29bb2d From 0be55004b9f20f525c93d92c2f7f3e91d8641b2b Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Sat, 6 Sep 2025 01:29:01 +0800 Subject: [PATCH 105/182] rm DocQA --- recipe/r1/tasks/DocQA.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 recipe/r1/tasks/DocQA.py diff --git a/recipe/r1/tasks/DocQA.py b/recipe/r1/tasks/DocQA.py deleted file mode 100644 index e69de29bb2d..00000000000 From 7f837ac3a4a2b98b217d393a4170456223cdbb05 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Sun, 7 Sep 2025 01:56:36 +0800 Subject: [PATCH 106/182] update 7b 128 --- .../dapo_7b_math_fsdp2_colocate.sh | 133 +++++++++++++ .../fsdp2_colocate/runtime_env.yaml | 3 + ..._fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh | 174 ++++++++++++++++++ .../fsdp2_fully-async_16-16/runtime_env.yaml | 5 + ..._fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh | 174 ++++++++++++++++++ .../fsdp2_fully-async_24-8/runtime_env.yaml | 5 + ..._fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh | 174 ++++++++++++++++++ .../fsdp2_fully-async_8-24/runtime_env.yaml | 5 + .../dapo_7b_math_megatron_colocate.sh | 135 ++++++++++++++ .../megatron_colocate/runtime_env.yaml | 3 + .../dapo_7b_math_megatron_colocate.sh | 4 +- .../dapo_7b_math_megatron_colocate.sh | 4 +- .../fsdp2_colocate/runtime_env.yaml | 3 + .../test_dapo_qwen3_30b_math.sh | 125 +++++++++++++ .../exp/qwen3-32B_128/fsdp2_colocate/fsdp2.sh | 125 
+++++++++++++ .../fsdp2_colocate/runtime_env.yaml | 3 + 16 files changed, 1071 insertions(+), 4 deletions(-) create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_colocate/runtime_env.yaml create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_16-16/runtime_env.yaml create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_24-8/runtime_env.yaml create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_8-24/runtime_env.yaml create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/megatron_colocate/dapo_7b_math_megatron_colocate.sh create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/megatron_colocate/runtime_env.yaml create mode 100644 recipe/fully_async_policy/exp/qwen3-30BA3B_128/fsdp2_colocate/runtime_env.yaml create mode 100644 recipe/fully_async_policy/exp/qwen3-30BA3B_128/fsdp2_colocate/test_dapo_qwen3_30b_math.sh create mode 100644 recipe/fully_async_policy/exp/qwen3-32B_128/fsdp2_colocate/fsdp2.sh create mode 100644 recipe/fully_async_policy/exp/qwen3-32B_128/fsdp2_colocate/runtime_env.yaml diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh new file mode 100644 index 00000000000..3538722d8a1 --- /dev/null +++ 
b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh
@@ -0,0 +1,133 @@
+#!/usr/bin/env bash
+set -xeuo pipefail  # launcher for the colocated FSDP2 DAPO run; trace commands, abort on errors, unset variables and pipeline failures
+
+project_name='DAPO'
+exp_name='dapo_qwen2-7B-math_28k_fsdp2_colocate_128_mbs32'
+
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 * 28))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+loss_agg_mode="token-mean"
+
+train_prompt_bsz=512
+n_resp_per_prompt=16
+train_prompt_mini_bsz=32
+
+# Ray
+# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
+# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
+NNODES=${NNODES:-16}  # overridable from the environment; defaults to 16
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}  # overridable from the environment; defaults to 8
+# Paths
+RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}  # NOTE: not referenced again in this script
+# Very important: after downloading the model from Hugging Face, set max_position_embeddings in its config.json to 32768.
+
+MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+# Performance Related Parameter
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))  # passed as actor.ppo_max_token_len_per_gpu
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))  # passed as ref/rollout log_prob_max_token_len_per_gpu
+offload=True  # applied to actor param/optimizer offload and ref param offload
+gen_tp=4  # rollout.tensor_model_parallel_size
+sp_size=4  # ulysses_sequence_parallel_size for actor and ref
+fsdp_size=2  # actor.fsdp_config.fsdp_size
+
+# reference run wandb: https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/runs/ow47vvon?nw=nwusertongyuxuan361
+
+
+python -m verl.trainer.main_ppo \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.strategy=fsdp2 \
+    critic.strategy=fsdp2 \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
+    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.model.enable_gradient_checkpointing=True \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.grad_clip=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \
+    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger=['console','tensorboard'] \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    trainer.nnodes="${NNODES}" \
+    trainer.val_before_train=True \
+    trainer.test_freq=20 \
+    trainer.save_freq=-1 \
+    trainer.total_epochs=10 \
+    trainer.total_training_steps=400 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.log_val_generations=10
diff --git
a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_colocate/runtime_env.yaml
new file mode 100644
index 00000000000..8fc2de3e70b
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_colocate/runtime_env.yaml
@@ -0,0 +1,3 @@
+env_vars:
+  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-128/dapo_qwen2-7B-math_28k_fsdp2_colocate_128_mbs32"
+  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
new file mode 100644
index 00000000000..618497c0257
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
@@ -0,0 +1,188 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+# dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20
+#
+# Launches fully-async DAPO training (recipe.fully_async_policy.fully_async_main) with
+# FSDP2 actors, splitting each node's GPUs between a rollout pool and a training pool.
+#
+# NOTE(review): exp_name says "8-24" and "28k", but this copy lives in
+# fsdp2_fully-async_16-16/, n_gpus_rollout is 6 and max_response_length is 8k below --
+# confirm the intended values for this experiment.
+project_name='DAPO'
+exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20'
+
+# Ray
+# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
+# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
+# Paths
+RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+# Very important: after downloading the model from Hugging Face, set max_position_embeddings in its config.json to 32768.
+MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"}
+CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
+TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
+TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
+
+# NOTE: the hard-coded site paths below unconditionally replace the env-overridable
+# defaults assigned above.
+# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B
+MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet
+# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet
+TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
+
+rollout_mode="async"
+rollout_name="vllm" # sglang or vllm
+# Default for non-async modes: without it `set -u` aborts on the unset
+# ${return_raw_chat} expansion in the launch command.
+return_raw_chat="False"
+if [ "$rollout_mode" = "async" ]; then
+    export VLLM_USE_V1=1
+    return_raw_chat="True"
+fi
+
+# Algorithm parameters
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+# Response length parameters
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 *8))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+# Training parameters
+loss_agg_mode="token-mean"
+
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+# Performance Related Parameter
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
+ref_offload=True
+actor_offload=False
+gen_tp=1
+sp_size=1
+fsdp_size=2
+
+# Fully async specific parameters
+NNODES=${NNODES:-2}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+
+# Per-node GPU split: n_gpus_rollout for rollout, the remainder for training.
+n_gpus_rollout=6
+n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout))
+
+train_prompt_bsz=0
+gen_prompt_bsz=1
+n_resp_per_prompt=16
+train_prompt_mini_bsz=32
+total_rollout_steps=$(((512*100)))
+test_freq=10
+staleness_threshold=1
+trigger_parameter_sync_step=64
+partial_rollout=True
+
+# Fall back to python3 from PATH when the site-specific interpreter is not executable.
+PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python"
+if [ ! -x "$PYTHON_INTERPRETER" ]; then
+    PYTHON_INTERPRETER="python3"
+fi
+
+$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    data.gen_batch_size=${gen_prompt_bsz} \
+    data.return_raw_chat=${return_raw_chat} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.strategy=fsdp2 \
+    critic.strategy=fsdp2 \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    actor_rollout_ref.hybrid_engine=False \
+    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
+    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.grad_clip=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
+    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
+    actor_rollout_ref.rollout.name=${rollout_name} \
+    actor_rollout_ref.rollout.mode=${rollout_mode} \
+    actor_rollout_ref.rollout.calculate_log_probs=True \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger=['console','tensorboard'] \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.val_before_train=True \
+    trainer.save_freq=-1 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.nnodes="${NNODES}" \
+    trainer.n_gpus_per_node="${n_gpus_training}" \
+    rollout.nnodes="${NNODES}" \
+    rollout.n_gpus_per_node="${n_gpus_rollout}" \
+    rollout.total_rollout_steps="${total_rollout_steps}" \
+    rollout.total_epochs=10 \
+    rollout.test_freq="${test_freq}" \
+    async_training.staleness_threshold="${staleness_threshold}" \
+    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
+    async_training.partial_rollout="${partial_rollout}"
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_16-16/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_16-16/runtime_env.yaml
new file mode 100644
index 00000000000..dcca08e67f7
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_16-16/runtime_env.yaml
@@ -0,0 +1,5 @@
+env_vars:
+  VLLM_USE_V1: "1"
+  TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard/dapo_7b_math_fsdp2_4_12"
+  NCCL_DEBUG: "INFO"
+  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
new file mode 100644
index 00000000000..618497c0257
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
@@ -0,0 +1,188 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+# dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20
+#
+# Launches fully-async DAPO training (recipe.fully_async_policy.fully_async_main) with
+# FSDP2 actors, splitting each node's GPUs between a rollout pool and a training pool.
+#
+# NOTE(review): exp_name says "8-24" and "28k", but this copy lives in
+# fsdp2_fully-async_24-8/, n_gpus_rollout is 6 and max_response_length is 8k below --
+# confirm the intended values for this experiment.
+project_name='DAPO'
+exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20'
+
+# Ray
+# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
+# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
+# Paths
+RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+# Very important: after downloading the model from Hugging Face, set max_position_embeddings in its config.json to 32768.
+MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"}
+CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
+TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
+TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
+
+# NOTE: the hard-coded site paths below unconditionally replace the env-overridable
+# defaults assigned above.
+# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B
+MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet
+# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet
+TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
+
+rollout_mode="async"
+rollout_name="vllm" # sglang or vllm
+# Default for non-async modes: without it `set -u` aborts on the unset
+# ${return_raw_chat} expansion in the launch command.
+return_raw_chat="False"
+if [ "$rollout_mode" = "async" ]; then
+    export VLLM_USE_V1=1
+    return_raw_chat="True"
+fi
+
+# Algorithm parameters
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+# Response length parameters
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 *8))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+# Training parameters
+loss_agg_mode="token-mean"
+
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+# Performance Related Parameter
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
+ref_offload=True
+actor_offload=False
+gen_tp=1
+sp_size=1
+fsdp_size=2
+
+# Fully async specific parameters
+NNODES=${NNODES:-2}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+
+# Per-node GPU split: n_gpus_rollout for rollout, the remainder for training.
+n_gpus_rollout=6
+n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout))
+
+train_prompt_bsz=0
+gen_prompt_bsz=1
+n_resp_per_prompt=16
+train_prompt_mini_bsz=32
+total_rollout_steps=$(((512*100)))
+test_freq=10
+staleness_threshold=1
+trigger_parameter_sync_step=64
+partial_rollout=True
+
+# Fall back to python3 from PATH when the site-specific interpreter is not executable.
+PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python"
+if [ ! -x "$PYTHON_INTERPRETER" ]; then
+    PYTHON_INTERPRETER="python3"
+fi
+
+$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    data.gen_batch_size=${gen_prompt_bsz} \
+    data.return_raw_chat=${return_raw_chat} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.strategy=fsdp2 \
+    critic.strategy=fsdp2 \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    actor_rollout_ref.hybrid_engine=False \
+    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
+    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.grad_clip=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
+    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
+    actor_rollout_ref.rollout.name=${rollout_name} \
+    actor_rollout_ref.rollout.mode=${rollout_mode} \
+    actor_rollout_ref.rollout.calculate_log_probs=True \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger=['console','tensorboard'] \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.val_before_train=True \
+    trainer.save_freq=-1 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.nnodes="${NNODES}" \
+    trainer.n_gpus_per_node="${n_gpus_training}" \
+    rollout.nnodes="${NNODES}" \
+    rollout.n_gpus_per_node="${n_gpus_rollout}" \
+    rollout.total_rollout_steps="${total_rollout_steps}" \
+    rollout.total_epochs=10 \
+    rollout.test_freq="${test_freq}" \
+    async_training.staleness_threshold="${staleness_threshold}" \
+    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
+    async_training.partial_rollout="${partial_rollout}"
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_24-8/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_24-8/runtime_env.yaml
new file mode 100644
index 00000000000..dcca08e67f7
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_24-8/runtime_env.yaml
@@ -0,0 +1,5 @@
+env_vars:
+  VLLM_USE_V1: "1"
+  TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard/dapo_7b_math_fsdp2_4_12"
+  NCCL_DEBUG: "INFO"
+  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
new file mode 100644
index 00000000000..618497c0257
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh
@@ -0,0 +1,174 @@
+#!/usr/bin/env bash
+set -xeuo pipefail
+# dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20
+project_name='DAPO'
+exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20'
+
+# Ray
+# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
+#
WORKING_DIR=${WORKING_DIR:-"${PWD}"} +# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} +# Paths +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface +MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"} +CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} +TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} +TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} + +# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B +MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B +CKPTS_DIR=./ckpts/${project_name}/${exp_name} +# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet +# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet +TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet +TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet + +rollout_mode="async" +rollout_name="vllm" # sglang or vllm +if [ "$rollout_mode" = "async" ]; then + export VLLM_USE_V1=1 + return_raw_chat="True" +fi + +# Algorithm parameters +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +# Response length parameters +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 *8)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 + +# Training parameters +loss_agg_mode="token-mean" + +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + 
+# Performance Related Parameter +use_dynamic_bsz=True +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) +ref_offload=True +actor_offload=False +gen_tp=1 +sp_size=1 +fsdp_size=2 + +# Fully async specific parameters +NNODES=${NNODES:-2} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} + +n_gpus_rollout=6 +n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout)) + +train_prompt_bsz=0 +gen_prompt_bsz=1 +n_resp_per_prompt=16 +train_prompt_mini_bsz=32 +total_rollout_steps=$(((512*100))) +test_freq=10 +staleness_threshold=1 +trigger_parameter_sync_step=64 +partial_rollout=True + +PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python" +if [ ! -x "$PYTHON_INTERPRETER" ]; then + PYTHON_INTERPRETER="python3" +fi + +$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + data.gen_batch_size=${gen_prompt_bsz} \ + data.return_raw_chat=${return_raw_chat} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.strategy=fsdp2 \ + critic.strategy=fsdp2 \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.hybrid_engine=False \ + +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ + 
actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.grad_clip=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \ + actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ + 
actor_rollout_ref.rollout.name=${rollout_name} \ + actor_rollout_ref.rollout.mode=${rollout_mode} \ + actor_rollout_ref.rollout.calculate_log_probs=True \ + reward_model.reward_manager=dapo \ + +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger=['console','tensorboard'] \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.val_before_train=True \ + trainer.save_freq=-1 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.resume_mode=auto \ + trainer.nnodes="${NNODES}" \ + trainer.n_gpus_per_node="${n_gpus_training}" \ + rollout.nnodes="${NNODES}" \ + rollout.n_gpus_per_node="${n_gpus_rollout}" \ + rollout.total_rollout_steps="${total_rollout_steps}" \ + rollout.total_epochs=10 \ + rollout.test_freq="${test_freq}" \ + async_training.staleness_threshold="${staleness_threshold}" \ + async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ + async_training.partial_rollout="${partial_rollout}" diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_8-24/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_8-24/runtime_env.yaml new file mode 100644 index 00000000000..dcca08e67f7 --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_8-24/runtime_env.yaml @@ -0,0 +1,5 @@ +env_vars: + VLLM_USE_V1: "1" + TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard/dapo_7b_math_fsdp2_4_12" + NCCL_DEBUG: "INFO" + HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git 
a/recipe/fully_async_policy/exp/qwen2-7B-math_128/megatron_colocate/dapo_7b_math_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_128/megatron_colocate/dapo_7b_math_megatron_colocate.sh new file mode 100644 index 00000000000..f98aeb86b57 --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_128/megatron_colocate/dapo_7b_math_megatron_colocate.sh @@ -0,0 +1,135 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +project_name='DAPO' +exp_name='dapo_qwen2-7B-math_28k_megatron_colocate_64_mbs32' + +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 * 28)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 + +loss_agg_mode="token-mean" + +train_prompt_bsz=512 +n_resp_per_prompt=16 +train_prompt_mini_bsz=32 + +# Ray +# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} +# WORKING_DIR=${WORKING_DIR:-"${PWD}"} +# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} +NNODES=${NNODES:-8} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} +# Paths +MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B +CKPTS_DIR=./ckpts/${project_name}/${exp_name} +TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet +TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet + +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + +# Performance Related Parameter +use_dynamic_bsz=True +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) +offload=True +gen_tp=4 +train_tp=4 +train_pp=2 + +# TODO: support dynamic_bsz for megatron +# 
actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ +# actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ +# actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ +# actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ +# actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ +# actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + +python3 -m verl.trainer.main_ppo \ + --config-path=config \ + --config-name='ppo_megatron_trainer.yaml' \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.strategy=megatron \ + critic.strategy=megatron \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.megatron.param_offload=${offload} \ + 
actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \ + actor_rollout_ref.actor.megatron.grad_offload=${offload} \ + actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \ + actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.optim.clip_grad=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \ + actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \ + actor_rollout_ref.ref.megatron.param_offload=${offload} \ + reward_model.reward_manager=dapo \ + +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger=['console','tensorboard'] \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes="${NNODES}" \ + 
trainer.val_before_train=True \ + trainer.test_freq=20 \ + trainer.save_freq=-1 \ + trainer.total_epochs=10 \ + trainer.total_training_steps=400 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.resume_mode=auto \ + trainer.log_val_generations=10 diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/megatron_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_128/megatron_colocate/runtime_env.yaml new file mode 100644 index 00000000000..6e33f46a65a --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_128/megatron_colocate/runtime_env.yaml @@ -0,0 +1,3 @@ +env_vars: + TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-128/dapo_qwen2-7B-math_28k_megatron_colocate_128_mbs32" + HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/dapo_7b_math_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/dapo_7b_math_megatron_colocate.sh index 8bf1af32da8..3879a99df67 100644 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/dapo_7b_math_megatron_colocate.sh +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/dapo_7b_math_megatron_colocate.sh @@ -50,8 +50,8 @@ actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) offload=True gen_tp=4 -train_tp=2 -train_pp=1 +train_tp=4 +train_pp=2 # TODO: support dynamic_bsz for megatron # actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/dapo_7b_math_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/dapo_7b_math_megatron_colocate.sh index 7444ec90c99..f98aeb86b57 100644 --- 
a/recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/dapo_7b_math_megatron_colocate.sh +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/dapo_7b_math_megatron_colocate.sh @@ -50,8 +50,8 @@ actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) offload=True gen_tp=4 -train_tp=2 -train_pp=1 +train_tp=4 +train_pp=2 # TODO: support dynamic_bsz for megatron # actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_128/fsdp2_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen3-30BA3B_128/fsdp2_colocate/runtime_env.yaml new file mode 100644 index 00000000000..069b1f14aa0 --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen3-30BA3B_128/fsdp2_colocate/runtime_env.yaml @@ -0,0 +1,3 @@ +env_vars: + TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/dapo_qwen3-30BA3B/dapo_qwen3-30BA3B-math_32k_fsdp2_colocate_128_mbs32" + HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_128/fsdp2_colocate/test_dapo_qwen3_30b_math.sh b/recipe/fully_async_policy/exp/qwen3-30BA3B_128/fsdp2_colocate/test_dapo_qwen3_30b_math.sh new file mode 100644 index 00000000000..591ac8533ee --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen3-30BA3B_128/fsdp2_colocate/test_dapo_qwen3_30b_math.sh @@ -0,0 +1,125 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +project_name='DAPO' +exp_name='dapo_qwen3-30BA3B-math_32k_fsdp2_colocate_128_mbs32' + +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 * 32)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 + +loss_agg_mode="token-mean" + +train_prompt_bsz=512 
+n_resp_per_prompt=16 +train_prompt_mini_bsz=32 + +# Ray +# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} +# WORKING_DIR=${WORKING_DIR:-"${PWD}"} +# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} +NNODES=${NNODES:-16} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} +# Paths +MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/basemodel/Qwen/Qwen3-30B-A3B +CKPTS_DIR=./ckpts/${project_name}/${exp_name} +TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet +TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet + +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + +# Performance Related Parameter +sp_size=4 +use_dynamic_bsz=True +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) +offload=True +gen_tp=4 +fsdp_size=32 + +python3 -m verl.trainer.main_ppo \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ + 
actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.grad_clip=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \ + actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ + 
reward_model.reward_manager=dapo \ + +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger='["console","tensorboard"]' \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ + trainer.nnodes="${NNODES}" \ + trainer.val_before_train=True \ + trainer.test_freq=20 \ + trainer.save_freq=-1 \ + trainer.total_epochs=10 \ + trainer.total_training_steps=400 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.resume_mode=auto \ + trainer.log_val_generations=10 diff --git a/recipe/fully_async_policy/exp/qwen3-32B_128/fsdp2_colocate/fsdp2.sh b/recipe/fully_async_policy/exp/qwen3-32B_128/fsdp2_colocate/fsdp2.sh new file mode 100644 index 00000000000..8f2e636c59f --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen3-32B_128/fsdp2_colocate/fsdp2.sh @@ -0,0 +1,125 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +project_name='DAPO' +exp_name='dapo_qwen3-32B-math_32k_fsdp2_colocate_128_mbs32' + +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 * 32)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 + +loss_agg_mode="token-mean" + +train_prompt_bsz=512 +n_resp_per_prompt=16 +train_prompt_mini_bsz=32 + +# Ray +# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} +# WORKING_DIR=${WORKING_DIR:-"${PWD}"} +# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} +NNODES=${NNODES:-16} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} +# Paths 
+MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/basemodel/Qwen/Qwen3-32B +CKPTS_DIR=./ckpts/${project_name}/${exp_name} +TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet +TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet + +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + +# Performance Related Parameter +sp_size=4 +use_dynamic_bsz=True +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) +offload=True +gen_tp=4 +fsdp_size=32 + +python3 -m verl.trainer.main_ppo \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + 
actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.grad_clip=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \ + actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ + reward_model.reward_manager=dapo \ + +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + 
+reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger='["console","tensorboard"]' \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ + trainer.nnodes="${NNODES}" \ + trainer.val_before_train=True \ + trainer.test_freq=20 \ + trainer.save_freq=-1 \ + trainer.total_epochs=10 \ + trainer.total_training_steps=400 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.resume_mode=auto \ + trainer.log_val_generations=10 diff --git a/recipe/fully_async_policy/exp/qwen3-32B_128/fsdp2_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen3-32B_128/fsdp2_colocate/runtime_env.yaml new file mode 100644 index 00000000000..1b4a8ff4b82 --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen3-32B_128/fsdp2_colocate/runtime_env.yaml @@ -0,0 +1,3 @@ +env_vars: + TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/dapo_qwen3-32B/dapo_qwen3-32B-math_32k_fsdp2_colocate_128_mbs32" + HYDRA_FULL_ERROR: "1" \ No newline at end of file From 3c239be789d18c14023d0b1b7c12f2857a726a42 Mon Sep 17 00:00:00 2001 From: wangshulin02 <953550366@qq.com> Date: Sun, 7 Sep 2025 11:51:15 +0800 Subject: [PATCH 107/182] fix typo in use_rollout_log_probs --- recipe/fully_async_policy/detach_utils.py | 2 +- verl/trainer/ppo/ray_trainer.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py index e01d82c1726..31738234c09 100644 --- a/recipe/fully_async_policy/detach_utils.py +++ b/recipe/fully_async_policy/detach_utils.py @@ -358,7 +358,7 @@ def get_aggregated_metrics(self) -> Dict[str, Any]: # Aggregate special metrics aggregated = self._special_metrics_aggergate(aggregated) - print(f"******************************aggregated metrics done. 
cost {time.time() - t}") + print(f"aggregated metrics done. cost {time.time() - t}") return aggregated diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py index 2d5a0538616..8d2c19d3364 100644 --- a/verl/trainer/ppo/ray_trainer.py +++ b/verl/trainer/ppo/ray_trainer.py @@ -1248,12 +1248,11 @@ def _process_batch_common(self, batch, metrics, timing_raw): # recompute old_log_probs with marked_timer("old_log_prob", timing_raw, color="blue"): async_training = self.config.get("async_training", None) - if async_training and async_training.use_rollout_log_prob: + if async_training and async_training.use_rollout_log_probs: batch.batch["old_log_probs"] = batch.batch["rollout_log_probs"] batch.meta_info["temperature"] = self.config.actor_rollout_ref.rollout.temperature else: - old_log_prob = self.actor_rollout_wg.compute_log_prob(batch) entropys = old_log_prob.batch["entropys"] response_masks = batch.batch["response_mask"] From 9cbce52395fa01ccf936f12828e1b1853cf1b964 Mon Sep 17 00:00:00 2001 From: wangshulin02 <953550366@qq.com> Date: Sun, 7 Sep 2025 12:42:42 +0800 Subject: [PATCH 108/182] remove unused code --- recipe/fully_async_policy/detach_utils.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py index 31738234c09..127afca6881 100644 --- a/recipe/fully_async_policy/detach_utils.py +++ b/recipe/fully_async_policy/detach_utils.py @@ -254,12 +254,7 @@ def _init_aggregation_rules(self) -> Dict[str, Dict[str, List[str]]]: return { # Time-Based metrics, can add metrics here 'time_sum': [ - 'timing_s/adv', - 'timing_s/gen', - 'timing_s/old_log_prob', - 'timing_s/reward', - 'timing_s/step', - 'timing_s/update_actor', + 'perf/time_per_step' ], } From 5539e037035a71bb93dd892fd6f28ac0af75afe7 Mon Sep 17 00:00:00 2001 From: wangshulin02 <953550366@qq.com> Date: Mon, 8 Sep 2025 15:21:43 +0800 Subject: [PATCH 109/182] add exp fully_async 32, 64 --- 
.../dapo_7b_math_fsdp2_colocate.sh | 133 ------------- .../fsdp2_colocate_64/runtime_env.yaml | 3 - ...8k_fsdp2_fully-async_16-16_mbs32_tfq16.sh} | 36 ++-- .../fsdp2_fully-async_16-16/runtime_env.yaml | 2 +- ...28k_fsdp2_fully-async_24-8_mbs32_tfq32.sh} | 36 ++-- .../fsdp2_fully-async_24-8/runtime_env.yaml | 2 +- ...28k_fsdp2_fully-async_8-24_mbs32_tfq11.sh} | 36 ++-- .../fsdp2_fully-async_8-24/runtime_env.yaml | 2 +- .../fsdp2_fully-async_16-16/runtime_env.yaml | 5 - ...28k_fsdp2_fully-async_24-40_mbs32_tfq6.sh} | 36 ++-- .../fsdp2_fully-async_24-40/runtime_env.yaml | 5 + ..._fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh | 174 ------------------ .../fsdp2_fully-async_24-8/runtime_env.yaml | 5 - ..._28k_fsdp2_fully-async_32-32_mbs32_tfq8.sh | 168 +++++++++++++++++ .../fsdp2_fully-async_32-32/runtime_env.yaml | 5 + ...28k_fsdp2_fully-async_40-24_mbs32_tfq11.sh | 168 +++++++++++++++++ .../fsdp2_fully-async_40-24/runtime_env.yaml | 5 + ..._fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh | 174 ------------------ .../fsdp2_fully-async_8-24/runtime_env.yaml | 5 - 19 files changed, 414 insertions(+), 586 deletions(-) delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate_64/dapo_7b_math_fsdp2_colocate.sh delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate_64/runtime_env.yaml rename recipe/fully_async_policy/exp/qwen2-7B-math_32/{fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh => fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tfq16.sh} (90%) rename recipe/fully_async_policy/exp/qwen2-7B-math_32/{fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh => fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-8_mbs32_tfq32.sh} (90%) rename recipe/fully_async_policy/exp/{qwen2-7B-math_64/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh => 
qwen2-7B-math_32/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tfq11.sh} (91%) delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_16-16/runtime_env.yaml rename recipe/fully_async_policy/exp/{qwen2-7B-math_32/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh => qwen2-7B-math_64/fsdp2_fully-async_24-40/dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-40_mbs32_tfq6.sh} (90%) create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/runtime_env.yaml delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-8/runtime_env.yaml create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/dapo_qwen2-7B-math_28k_fsdp2_fully-async_32-32_mbs32_tfq8.sh create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/runtime_env.yaml create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_40-24_mbs32_tfq11.sh create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/runtime_env.yaml delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_8-24/runtime_env.yaml diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate_64/dapo_7b_math_fsdp2_colocate.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate_64/dapo_7b_math_fsdp2_colocate.sh deleted file mode 100644 index 8d42dca04ca..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate_64/dapo_7b_math_fsdp2_colocate.sh +++ /dev/null @@ 
-1,133 +0,0 @@ -#!/usr/bin/env bash -set -xeuo pipefail - -project_name='DAPO' -exp_name='dapo_qwen2-7B-math_28k_fsdp2_colocate_32_mbs32' - -adv_estimator=grpo - -use_kl_in_reward=False -kl_coef=0.0 -use_kl_loss=False -kl_loss_coef=0.0 - -clip_ratio_low=0.2 -clip_ratio_high=0.28 - -max_prompt_length=$((1024 * 2)) -max_response_length=$((1024 * 28)) -enable_overlong_buffer=True -overlong_buffer_len=$((1024 * 4)) -overlong_penalty_factor=1.0 - -loss_agg_mode="token-mean" - -train_prompt_bsz=512 -n_resp_per_prompt=16 -train_prompt_mini_bsz=32 - -# Ray -# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} -# WORKING_DIR=${WORKING_DIR:-"${PWD}"} -# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} -NNODES=${NNODES:-4} -NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} -# Paths -RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} -# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface - -MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B -CKPTS_DIR=./ckpts/${project_name}/${exp_name} -TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet -TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet -# Algorithm -temperature=1.0 -top_p=1.0 -top_k=-1 # 0 for HF rollout, -1 for vLLM rollout -val_top_p=0.7 - -# Performance Related Parameter -use_dynamic_bsz=True -actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) -infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) -offload=True -gen_tp=4 -sp_size=4 -fsdp_size=2 - -# reference run wandb: https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/runs/ow47vvon?nw=nwusertongyuxuan361 - - -python -m verl.trainer.main_ppo \ - data.train_files="${TRAIN_FILE}" \ - data.val_files="${TEST_FILE}" \ - data.prompt_key=prompt \ - data.truncation='left' 
\ - data.max_prompt_length=${max_prompt_length} \ - data.max_response_length=${max_response_length} \ - data.train_batch_size=${train_prompt_bsz} \ - actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ - algorithm.adv_estimator=${adv_estimator} \ - algorithm.use_kl_in_reward=${use_kl_in_reward} \ - algorithm.kl_ctrl.kl_coef=${kl_coef} \ - actor_rollout_ref.actor.strategy=fsdp2 \ - critic.strategy=fsdp2 \ - actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ - actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ - actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ - actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ - actor_rollout_ref.actor.clip_ratio_c=10.0 \ - actor_rollout_ref.model.use_remove_padding=True \ - +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ - actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ - actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.model.path="${MODEL_PATH}" \ - actor_rollout_ref.model.enable_gradient_checkpointing=True \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ - actor_rollout_ref.actor.optim.weight_decay=0.1 \ - actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ - actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \ - actor_rollout_ref.actor.entropy_coeff=0 \ - actor_rollout_ref.actor.grad_clip=1.0 \ - actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ - actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ - 
actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ - actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ - actor_rollout_ref.rollout.enable_chunked_prefill=True \ - actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ - actor_rollout_ref.rollout.temperature=${temperature} \ - actor_rollout_ref.rollout.top_p=${top_p} \ - actor_rollout_ref.rollout.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ - actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ - actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.do_sample=True \ - actor_rollout_ref.rollout.val_kwargs.n=1 \ - actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \ - actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ - reward_model.reward_manager=dapo \ - +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ - +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ - trainer.logger=['console','tensorboard'] \ - trainer.project_name="${project_name}" \ - trainer.experiment_name="${exp_name}" \ - trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - trainer.nnodes="${NNODES}" \ - trainer.val_before_train=True \ - trainer.test_freq=20 \ - trainer.save_freq=-1 \ - trainer.total_epochs=10 \ - trainer.total_training_steps=400 \ - trainer.default_local_dir="${CKPTS_DIR}" \ - trainer.resume_mode=auto \ - trainer.log_val_generations=10 diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate_64/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate_64/runtime_env.yaml deleted file mode 100644 
index 39c5a3593e8..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate_64/runtime_env.yaml +++ /dev/null @@ -1,3 +0,0 @@ -env_vars: - TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math/dapo_qwen2-7B-math_28k_fsdp2_colocate_32_mbs32" - HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tfq16.sh similarity index 90% rename from recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh rename to recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tfq16.sh index 618497c0257..c49a6460696 100644 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tfq16.sh @@ -1,8 +1,8 @@ #!/usr/bin/env bash set -xeuo pipefail -# dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20 + project_name='DAPO' -exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20' +exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tpf16' # Ray # RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} @@ -44,7 +44,7 @@ clip_ratio_high=0.28 # Response length parameters max_prompt_length=$((1024 * 2)) -max_response_length=$((1024 *8)) +max_response_length=$((1024 * 28)) enable_overlong_buffer=True overlong_buffer_len=$((1024 * 4)) overlong_penalty_factor=1.0 @@ -64,33 +64,27 @@ actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) 
infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) ref_offload=True actor_offload=False -gen_tp=1 -sp_size=1 +gen_tp=4 +sp_size=4 fsdp_size=2 # Fully async specific parameters -NNODES=${NNODES:-2} +NNODES_ROLLOUT=${NNODES_ROLLOUT:-2} +NNODES_TRAIN=${NNODES_TRAIN:-2} NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} -n_gpus_rollout=6 -n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout)) train_prompt_bsz=0 gen_prompt_bsz=1 n_resp_per_prompt=16 train_prompt_mini_bsz=32 -total_rollout_steps=$(((512*100))) -test_freq=10 +total_rollout_steps=$(((512*400))) +test_freq=20 staleness_threshold=1 -trigger_parameter_sync_step=64 +trigger_parameter_sync_step=16 partial_rollout=True -PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python" -if [ ! -x "$PYTHON_INTERPRETER" ]; then - PYTHON_INTERPRETER="python3" -fi - -$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \ +python -m recipe.fully_async_policy.fully_async_main \ data.train_files="${TRAIN_FILE}" \ data.val_files="${TEST_FILE}" \ data.prompt_key=prompt \ @@ -162,10 +156,10 @@ $PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \ trainer.save_freq=-1 \ trainer.default_local_dir="${CKPTS_DIR}" \ trainer.resume_mode=auto \ - trainer.nnodes="${NNODES}" \ - trainer.n_gpus_per_node="${n_gpus_training}" \ - rollout.nnodes="${NNODES}" \ - rollout.n_gpus_per_node="${n_gpus_rollout}" \ + trainer.nnodes="${NNODES_TRAIN}" \ + trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ + rollout.nnodes="${NNODES_ROLLOUT}" \ + rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ rollout.total_rollout_steps="${total_rollout_steps}" \ rollout.total_epochs=10 \ rollout.test_freq="${test_freq}" \ diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/runtime_env.yaml index dcca08e67f7..de7e1aa0e1c 100644 --- 
a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/runtime_env.yaml +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/runtime_env.yaml @@ -1,5 +1,5 @@ env_vars: VLLM_USE_V1: "1" - TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard/dapo_7b_math_fsdp2_4_12" + TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tpf16" NCCL_DEBUG: "INFO" HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-8_mbs32_tfq32.sh similarity index 90% rename from recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh rename to recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-8_mbs32_tfq32.sh index 618497c0257..6c6cb13cf45 100644 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-8_mbs32_tfq32.sh @@ -1,8 +1,8 @@ #!/usr/bin/env bash set -xeuo pipefail -# dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20 + project_name='DAPO' -exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20' +exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-8_mbs32_tpf32' # Ray # RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} @@ -44,7 +44,7 @@ clip_ratio_high=0.28 # Response length parameters max_prompt_length=$((1024 * 2)) -max_response_length=$((1024 *8)) 
+max_response_length=$((1024 * 28)) enable_overlong_buffer=True overlong_buffer_len=$((1024 * 4)) overlong_penalty_factor=1.0 @@ -64,33 +64,27 @@ actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) ref_offload=True actor_offload=False -gen_tp=1 -sp_size=1 +gen_tp=4 +sp_size=4 fsdp_size=2 # Fully async specific parameters -NNODES=${NNODES:-2} +NNODES_ROLLOUT=${NNODES_ROLLOUT:-3} +NNODES_TRAIN=${NNODES_TRAIN:-1} NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} -n_gpus_rollout=6 -n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout)) train_prompt_bsz=0 gen_prompt_bsz=1 n_resp_per_prompt=16 train_prompt_mini_bsz=32 -total_rollout_steps=$(((512*100))) -test_freq=10 +total_rollout_steps=$(((512*400))) +test_freq=20 staleness_threshold=1 -trigger_parameter_sync_step=64 +trigger_parameter_sync_step=32 partial_rollout=True -PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python" -if [ ! -x "$PYTHON_INTERPRETER" ]; then - PYTHON_INTERPRETER="python3" -fi - -$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \ +python -m recipe.fully_async_policy.fully_async_main \ data.train_files="${TRAIN_FILE}" \ data.val_files="${TEST_FILE}" \ data.prompt_key=prompt \ @@ -162,10 +156,10 @@ $PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \ trainer.save_freq=-1 \ trainer.default_local_dir="${CKPTS_DIR}" \ trainer.resume_mode=auto \ - trainer.nnodes="${NNODES}" \ - trainer.n_gpus_per_node="${n_gpus_training}" \ - rollout.nnodes="${NNODES}" \ - rollout.n_gpus_per_node="${n_gpus_rollout}" \ + trainer.nnodes="${NNODES_TRAIN}" \ + trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ + rollout.nnodes="${NNODES_ROLLOUT}" \ + rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ rollout.total_rollout_steps="${total_rollout_steps}" \ rollout.total_epochs=10 \ rollout.test_freq="${test_freq}" \ diff --git 
a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/runtime_env.yaml index dcca08e67f7..7402c1b37b0 100644 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/runtime_env.yaml +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/runtime_env.yaml @@ -1,5 +1,5 @@ env_vars: VLLM_USE_V1: "1" - TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard/dapo_7b_math_fsdp2_4_12" + TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math/dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-8_mbs32_tpf32" NCCL_DEBUG: "INFO" HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tfq11.sh similarity index 91% rename from recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh rename to recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tfq11.sh index 618497c0257..9add4e0e8bb 100644 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tfq11.sh @@ -1,8 +1,8 @@ #!/usr/bin/env bash set -xeuo pipefail -# dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20 + project_name='DAPO' -exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20' 
+exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf11' # Ray # RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} @@ -44,7 +44,7 @@ clip_ratio_high=0.28 # Response length parameters max_prompt_length=$((1024 * 2)) -max_response_length=$((1024 *8)) +max_response_length=$((1024 * 28)) enable_overlong_buffer=True overlong_buffer_len=$((1024 * 4)) overlong_penalty_factor=1.0 @@ -64,33 +64,27 @@ actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) ref_offload=True actor_offload=False -gen_tp=1 -sp_size=1 +gen_tp=4 +sp_size=4 fsdp_size=2 # Fully async specific parameters -NNODES=${NNODES:-2} +NNODES_ROLLOUT=${NNODES_ROLLOUT:-1} +NNODES_TRAIN=${NNODES_TRAIN:-3} NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} -n_gpus_rollout=6 -n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout)) train_prompt_bsz=0 gen_prompt_bsz=1 n_resp_per_prompt=16 train_prompt_mini_bsz=32 -total_rollout_steps=$(((512*100))) -test_freq=10 +total_rollout_steps=$(((512*400))) +test_freq=20 staleness_threshold=1 -trigger_parameter_sync_step=64 +trigger_parameter_sync_step=11 partial_rollout=True -PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python" -if [ ! 
-x "$PYTHON_INTERPRETER" ]; then - PYTHON_INTERPRETER="python3" -fi - -$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \ +python -m recipe.fully_async_policy.fully_async_main \ data.train_files="${TRAIN_FILE}" \ data.val_files="${TEST_FILE}" \ data.prompt_key=prompt \ @@ -162,10 +156,10 @@ $PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \ trainer.save_freq=-1 \ trainer.default_local_dir="${CKPTS_DIR}" \ trainer.resume_mode=auto \ - trainer.nnodes="${NNODES}" \ - trainer.n_gpus_per_node="${n_gpus_training}" \ - rollout.nnodes="${NNODES}" \ - rollout.n_gpus_per_node="${n_gpus_rollout}" \ + trainer.nnodes="${NNODES_TRAIN}" \ + trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ + rollout.nnodes="${NNODES_ROLLOUT}" \ + rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ rollout.total_rollout_steps="${total_rollout_steps}" \ rollout.total_epochs=10 \ rollout.test_freq="${test_freq}" \ diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/runtime_env.yaml index dcca08e67f7..fc404cfd985 100644 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/runtime_env.yaml +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/runtime_env.yaml @@ -1,5 +1,5 @@ env_vars: VLLM_USE_V1: "1" - TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard/dapo_7b_math_fsdp2_4_12" + TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf11" NCCL_DEBUG: "INFO" HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_16-16/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_16-16/runtime_env.yaml deleted file mode 100644 index dcca08e67f7..00000000000 --- 
a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_16-16/runtime_env.yaml +++ /dev/null @@ -1,5 +0,0 @@ -env_vars: - VLLM_USE_V1: "1" - TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard/dapo_7b_math_fsdp2_4_12" - NCCL_DEBUG: "INFO" - HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-40_mbs32_tfq6.sh similarity index 90% rename from recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh rename to recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-40_mbs32_tfq6.sh index 618497c0257..5da2116ef80 100644 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-40_mbs32_tfq6.sh @@ -1,8 +1,8 @@ #!/usr/bin/env bash set -xeuo pipefail -# dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20 + project_name='DAPO' -exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20' +exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-40_mbs32_tpf6' # Ray # RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} @@ -44,7 +44,7 @@ clip_ratio_high=0.28 # Response length parameters max_prompt_length=$((1024 * 2)) -max_response_length=$((1024 *8)) +max_response_length=$((1024 * 28)) enable_overlong_buffer=True overlong_buffer_len=$((1024 * 4)) overlong_penalty_factor=1.0 @@ -64,33 +64,27 @@ actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) 
infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) ref_offload=True actor_offload=False -gen_tp=1 -sp_size=1 +gen_tp=4 +sp_size=4 fsdp_size=2 # Fully async specific parameters -NNODES=${NNODES:-2} +NNODES_ROLLOUT=${NNODES_ROLLOUT:-3} +NNODES_TRAIN=${NNODES_TRAIN:-5} NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} -n_gpus_rollout=6 -n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout)) train_prompt_bsz=0 gen_prompt_bsz=1 n_resp_per_prompt=16 train_prompt_mini_bsz=32 -total_rollout_steps=$(((512*100))) -test_freq=10 +total_rollout_steps=$(((512*400))) +test_freq=20 staleness_threshold=1 -trigger_parameter_sync_step=64 +trigger_parameter_sync_step=6 partial_rollout=True -PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python" -if [ ! -x "$PYTHON_INTERPRETER" ]; then - PYTHON_INTERPRETER="python3" -fi - -$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \ +python -m recipe.fully_async_policy.fully_async_main \ data.train_files="${TRAIN_FILE}" \ data.val_files="${TEST_FILE}" \ data.prompt_key=prompt \ @@ -162,10 +156,10 @@ $PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \ trainer.save_freq=-1 \ trainer.default_local_dir="${CKPTS_DIR}" \ trainer.resume_mode=auto \ - trainer.nnodes="${NNODES}" \ - trainer.n_gpus_per_node="${n_gpus_training}" \ - rollout.nnodes="${NNODES}" \ - rollout.n_gpus_per_node="${n_gpus_rollout}" \ + trainer.nnodes="${NNODES_TRAIN}" \ + trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ + rollout.nnodes="${NNODES_ROLLOUT}" \ + rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ rollout.total_rollout_steps="${total_rollout_steps}" \ rollout.total_epochs=10 \ rollout.test_freq="${test_freq}" \ diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/runtime_env.yaml new file mode 100644 index 00000000000..ef67409ba6f --- /dev/null +++ 
b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/runtime_env.yaml @@ -0,0 +1,5 @@ +env_vars: + VLLM_USE_V1: "1" + TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-64/dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-40_mbs32_tpf6" + NCCL_DEBUG: "INFO" + HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh deleted file mode 100644 index 618497c0257..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh +++ /dev/null @@ -1,174 +0,0 @@ -#!/usr/bin/env bash -set -xeuo pipefail -# dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20 -project_name='DAPO' -exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20' - -# Ray -# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} -# WORKING_DIR=${WORKING_DIR:-"${PWD}"} -# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} -# Paths -RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} -# very important! 
please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface -MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"} -CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} -TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} -TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} - -# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B -MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B -CKPTS_DIR=./ckpts/${project_name}/${exp_name} -# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet -# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet -TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet -TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet - -rollout_mode="async" -rollout_name="vllm" # sglang or vllm -if [ "$rollout_mode" = "async" ]; then - export VLLM_USE_V1=1 - return_raw_chat="True" -fi - -# Algorithm parameters -adv_estimator=grpo - -use_kl_in_reward=False -kl_coef=0.0 -use_kl_loss=False -kl_loss_coef=0.0 - -clip_ratio_low=0.2 -clip_ratio_high=0.28 - -# Response length parameters -max_prompt_length=$((1024 * 2)) -max_response_length=$((1024 *8)) -enable_overlong_buffer=True -overlong_buffer_len=$((1024 * 4)) -overlong_penalty_factor=1.0 - -# Training parameters -loss_agg_mode="token-mean" - -# Algorithm -temperature=1.0 -top_p=1.0 -top_k=-1 # 0 for HF rollout, -1 for vLLM rollout -val_top_p=0.7 - -# Performance Related Parameter -use_dynamic_bsz=True -actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) -infer_ppo_max_token_len=$(((max_prompt_length + 
max_response_length) * 3)) -ref_offload=True -actor_offload=False -gen_tp=1 -sp_size=1 -fsdp_size=2 - -# Fully async specific parameters -NNODES=${NNODES:-2} -NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} - -n_gpus_rollout=6 -n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout)) - -train_prompt_bsz=0 -gen_prompt_bsz=1 -n_resp_per_prompt=16 -train_prompt_mini_bsz=32 -total_rollout_steps=$(((512*100))) -test_freq=10 -staleness_threshold=1 -trigger_parameter_sync_step=64 -partial_rollout=True - -PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python" -if [ ! -x "$PYTHON_INTERPRETER" ]; then - PYTHON_INTERPRETER="python3" -fi - -$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \ - data.train_files="${TRAIN_FILE}" \ - data.val_files="${TEST_FILE}" \ - data.prompt_key=prompt \ - data.truncation='left' \ - data.max_prompt_length=${max_prompt_length} \ - data.max_response_length=${max_response_length} \ - data.train_batch_size=${train_prompt_bsz} \ - data.gen_batch_size=${gen_prompt_bsz} \ - data.return_raw_chat=${return_raw_chat} \ - actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ - algorithm.adv_estimator=${adv_estimator} \ - algorithm.use_kl_in_reward=${use_kl_in_reward} \ - algorithm.kl_ctrl.kl_coef=${kl_coef} \ - actor_rollout_ref.actor.strategy=fsdp2 \ - critic.strategy=fsdp2 \ - actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ - actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ - actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ - actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ - actor_rollout_ref.actor.clip_ratio_c=10.0 \ - actor_rollout_ref.model.use_remove_padding=True \ - actor_rollout_ref.hybrid_engine=False \ - +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ - actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - 
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ - actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.model.path="${MODEL_PATH}" \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ - actor_rollout_ref.actor.optim.weight_decay=0.1 \ - actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ - actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \ - actor_rollout_ref.actor.entropy_coeff=0 \ - actor_rollout_ref.actor.grad_clip=1.0 \ - actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ - actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ - actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ - actor_rollout_ref.rollout.enable_chunked_prefill=True \ - actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ - actor_rollout_ref.rollout.temperature=${temperature} \ - actor_rollout_ref.rollout.top_p=${top_p} \ - actor_rollout_ref.rollout.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ - actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ - actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.do_sample=True \ - actor_rollout_ref.rollout.val_kwargs.n=1 \ - actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \ - actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ - actor_rollout_ref.rollout.name=${rollout_name} \ - actor_rollout_ref.rollout.mode=${rollout_mode} \ - actor_rollout_ref.rollout.calculate_log_probs=True \ - reward_model.reward_manager=dapo \ - 
+reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ - +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ - trainer.logger=['console','tensorboard'] \ - trainer.project_name="${project_name}" \ - trainer.experiment_name="${exp_name}" \ - trainer.val_before_train=True \ - trainer.save_freq=-1 \ - trainer.default_local_dir="${CKPTS_DIR}" \ - trainer.resume_mode=auto \ - trainer.nnodes="${NNODES}" \ - trainer.n_gpus_per_node="${n_gpus_training}" \ - rollout.nnodes="${NNODES}" \ - rollout.n_gpus_per_node="${n_gpus_rollout}" \ - rollout.total_rollout_steps="${total_rollout_steps}" \ - rollout.total_epochs=10 \ - rollout.test_freq="${test_freq}" \ - async_training.staleness_threshold="${staleness_threshold}" \ - async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ - async_training.partial_rollout="${partial_rollout}" diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-8/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-8/runtime_env.yaml deleted file mode 100644 index dcca08e67f7..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-8/runtime_env.yaml +++ /dev/null @@ -1,5 +0,0 @@ -env_vars: - VLLM_USE_V1: "1" - TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard/dapo_7b_math_fsdp2_4_12" - NCCL_DEBUG: "INFO" - HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/dapo_qwen2-7B-math_28k_fsdp2_fully-async_32-32_mbs32_tfq8.sh 
b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/dapo_qwen2-7B-math_28k_fsdp2_fully-async_32-32_mbs32_tfq8.sh new file mode 100644 index 00000000000..c31c59df4db --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/dapo_qwen2-7B-math_28k_fsdp2_fully-async_32-32_mbs32_tfq8.sh @@ -0,0 +1,168 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +project_name='DAPO' +exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_32-32_mbs32_tpf8' + +# Ray +# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} +# WORKING_DIR=${WORKING_DIR:-"${PWD}"} +# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} +# Paths +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface +MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"} +CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} +TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} +TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} + +# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B +MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B +CKPTS_DIR=./ckpts/${project_name}/${exp_name} +# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet +# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet +TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet +TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet + +rollout_mode="async" +rollout_name="vllm" # sglang or vllm +if [ "$rollout_mode" = "async" ]; then + export 
VLLM_USE_V1=1 + return_raw_chat="True" +fi + +# Algorithm parameters +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +# Response length parameters +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 * 28)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 + +# Training parameters +loss_agg_mode="token-mean" + +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + +# Performance Related Parameter +use_dynamic_bsz=True +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) +ref_offload=True +actor_offload=False +gen_tp=4 +sp_size=4 +fsdp_size=2 + +# Fully async specific parameters +NNODES_ROLLOUT=${NNODES_ROLLOUT:-4} +NNODES_TRAIN=${NNODES_TRAIN:-4} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} + + +train_prompt_bsz=0 +gen_prompt_bsz=1 +n_resp_per_prompt=16 +train_prompt_mini_bsz=32 +total_rollout_steps=$(((512*400))) +test_freq=20 +staleness_threshold=1 +trigger_parameter_sync_step=8 +partial_rollout=True + +python -m recipe.fully_async_policy.fully_async_main \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + data.gen_batch_size=${gen_prompt_bsz} \ + data.return_raw_chat=${return_raw_chat} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.strategy=fsdp2 \ + critic.strategy=fsdp2 \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + 
actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.hybrid_engine=False \ + +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ + actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.grad_clip=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + 
actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \ + actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ + actor_rollout_ref.rollout.name=${rollout_name} \ + actor_rollout_ref.rollout.mode=${rollout_mode} \ + actor_rollout_ref.rollout.calculate_log_probs=True \ + reward_model.reward_manager=dapo \ + +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger=['console','tensorboard'] \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.val_before_train=True \ + trainer.save_freq=-1 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.resume_mode=auto \ + trainer.nnodes="${NNODES_TRAIN}" \ + trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ + rollout.nnodes="${NNODES_ROLLOUT}" \ + rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ + rollout.total_rollout_steps="${total_rollout_steps}" \ + rollout.total_epochs=10 \ + rollout.test_freq="${test_freq}" \ + async_training.staleness_threshold="${staleness_threshold}" \ + async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ + async_training.partial_rollout="${partial_rollout}" diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/runtime_env.yaml new file mode 100644 index 00000000000..20d464776b0 --- /dev/null +++ 
b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/runtime_env.yaml @@ -0,0 +1,5 @@ +env_vars: + VLLM_USE_V1: "1" + TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-64/dapo_qwen2-7B-math_28k_fsdp2_fully-async_32-32_mbs32_tpf8" + NCCL_DEBUG: "INFO" + HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_40-24_mbs32_tfq11.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_40-24_mbs32_tfq11.sh new file mode 100644 index 00000000000..a15cf990bd1 --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_40-24_mbs32_tfq11.sh @@ -0,0 +1,168 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +project_name='DAPO' +exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_40-24_mbs32_tpf11' + +# Ray +# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} +# WORKING_DIR=${WORKING_DIR:-"${PWD}"} +# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} +# Paths +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +# very important! 
please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface +MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"} +CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} +TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} +TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} + +# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B +MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B +CKPTS_DIR=./ckpts/${project_name}/${exp_name} +# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet +# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet +TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet +TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet + +rollout_mode="async" +rollout_name="vllm" # sglang or vllm +if [ "$rollout_mode" = "async" ]; then + export VLLM_USE_V1=1 + return_raw_chat="True" +fi + +# Algorithm parameters +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +# Response length parameters +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 * 28)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 + +# Training parameters +loss_agg_mode="token-mean" + +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + +# Performance Related Parameter +use_dynamic_bsz=True +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) +infer_ppo_max_token_len=$(((max_prompt_length + 
max_response_length) * 3)) +ref_offload=True +actor_offload=False +gen_tp=4 +sp_size=4 +fsdp_size=2 + +# Fully async specific parameters +NNODES_ROLLOUT=${NNODES_ROLLOUT:-5} +NNODES_TRAIN=${NNODES_TRAIN:-3} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} + + +train_prompt_bsz=0 +gen_prompt_bsz=1 +n_resp_per_prompt=16 +train_prompt_mini_bsz=32 +total_rollout_steps=$(((512*400))) +test_freq=20 +staleness_threshold=1 +trigger_parameter_sync_step=11 +partial_rollout=True + +python -m recipe.fully_async_policy.fully_async_main \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + data.gen_batch_size=${gen_prompt_bsz} \ + data.return_raw_chat=${return_raw_chat} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.strategy=fsdp2 \ + critic.strategy=fsdp2 \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.hybrid_engine=False \ + +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ + actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + 
actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.grad_clip=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \ + actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ + actor_rollout_ref.rollout.name=${rollout_name} \ + actor_rollout_ref.rollout.mode=${rollout_mode} \ + actor_rollout_ref.rollout.calculate_log_probs=True \ + reward_model.reward_manager=dapo \ + +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + 
+reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger=['console','tensorboard'] \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.val_before_train=True \ + trainer.save_freq=-1 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.resume_mode=auto \ + trainer.nnodes="${NNODES_TRAIN}" \ + trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ + rollout.nnodes="${NNODES_ROLLOUT}" \ + rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ + rollout.total_rollout_steps="${total_rollout_steps}" \ + rollout.total_epochs=10 \ + rollout.test_freq="${test_freq}" \ + async_training.staleness_threshold="${staleness_threshold}" \ + async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ + async_training.partial_rollout="${partial_rollout}" diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/runtime_env.yaml new file mode 100644 index 00000000000..93ae17ebb6f --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/runtime_env.yaml @@ -0,0 +1,5 @@ +env_vars: + VLLM_USE_V1: "1" + TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-64/dapo_qwen2-7B-math_28k_fsdp2_fully-async_40-24_mbs32_tpf11" + NCCL_DEBUG: "INFO" + HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh deleted file mode 100644 index 618497c0257..00000000000 --- 
a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh +++ /dev/null @@ -1,174 +0,0 @@ -#!/usr/bin/env bash -set -xeuo pipefail -# dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20 -project_name='DAPO' -exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20' - -# Ray -# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} -# WORKING_DIR=${WORKING_DIR:-"${PWD}"} -# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} -# Paths -RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} -# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface -MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"} -CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} -TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} -TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} - -# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B -MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B -CKPTS_DIR=./ckpts/${project_name}/${exp_name} -# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet -# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet -TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet -TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet - -rollout_mode="async" -rollout_name="vllm" # sglang or vllm -if [ "$rollout_mode" = "async" ]; then - export VLLM_USE_V1=1 - return_raw_chat="True" -fi - -# Algorithm parameters -adv_estimator=grpo - -use_kl_in_reward=False 
-kl_coef=0.0 -use_kl_loss=False -kl_loss_coef=0.0 - -clip_ratio_low=0.2 -clip_ratio_high=0.28 - -# Response length parameters -max_prompt_length=$((1024 * 2)) -max_response_length=$((1024 *8)) -enable_overlong_buffer=True -overlong_buffer_len=$((1024 * 4)) -overlong_penalty_factor=1.0 - -# Training parameters -loss_agg_mode="token-mean" - -# Algorithm -temperature=1.0 -top_p=1.0 -top_k=-1 # 0 for HF rollout, -1 for vLLM rollout -val_top_p=0.7 - -# Performance Related Parameter -use_dynamic_bsz=True -actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) -infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) -ref_offload=True -actor_offload=False -gen_tp=1 -sp_size=1 -fsdp_size=2 - -# Fully async specific parameters -NNODES=${NNODES:-2} -NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} - -n_gpus_rollout=6 -n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout)) - -train_prompt_bsz=0 -gen_prompt_bsz=1 -n_resp_per_prompt=16 -train_prompt_mini_bsz=32 -total_rollout_steps=$(((512*100))) -test_freq=10 -staleness_threshold=1 -trigger_parameter_sync_step=64 -partial_rollout=True - -PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python" -if [ ! 
-x "$PYTHON_INTERPRETER" ]; then - PYTHON_INTERPRETER="python3" -fi - -$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \ - data.train_files="${TRAIN_FILE}" \ - data.val_files="${TEST_FILE}" \ - data.prompt_key=prompt \ - data.truncation='left' \ - data.max_prompt_length=${max_prompt_length} \ - data.max_response_length=${max_response_length} \ - data.train_batch_size=${train_prompt_bsz} \ - data.gen_batch_size=${gen_prompt_bsz} \ - data.return_raw_chat=${return_raw_chat} \ - actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ - algorithm.adv_estimator=${adv_estimator} \ - algorithm.use_kl_in_reward=${use_kl_in_reward} \ - algorithm.kl_ctrl.kl_coef=${kl_coef} \ - actor_rollout_ref.actor.strategy=fsdp2 \ - critic.strategy=fsdp2 \ - actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ - actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ - actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ - actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ - actor_rollout_ref.actor.clip_ratio_c=10.0 \ - actor_rollout_ref.model.use_remove_padding=True \ - actor_rollout_ref.hybrid_engine=False \ - +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ - actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ - actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.model.path="${MODEL_PATH}" \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ - actor_rollout_ref.actor.optim.weight_decay=0.1 \ - actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ - 
actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \ - actor_rollout_ref.actor.entropy_coeff=0 \ - actor_rollout_ref.actor.grad_clip=1.0 \ - actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ - actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ - actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ - actor_rollout_ref.rollout.enable_chunked_prefill=True \ - actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ - actor_rollout_ref.rollout.temperature=${temperature} \ - actor_rollout_ref.rollout.top_p=${top_p} \ - actor_rollout_ref.rollout.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ - actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ - actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.do_sample=True \ - actor_rollout_ref.rollout.val_kwargs.n=1 \ - actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \ - actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ - actor_rollout_ref.rollout.name=${rollout_name} \ - actor_rollout_ref.rollout.mode=${rollout_mode} \ - actor_rollout_ref.rollout.calculate_log_probs=True \ - reward_model.reward_manager=dapo \ - +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ - +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ - trainer.logger=['console','tensorboard'] \ - trainer.project_name="${project_name}" \ - trainer.experiment_name="${exp_name}" \ - 
trainer.val_before_train=True \ - trainer.save_freq=-1 \ - trainer.default_local_dir="${CKPTS_DIR}" \ - trainer.resume_mode=auto \ - trainer.nnodes="${NNODES}" \ - trainer.n_gpus_per_node="${n_gpus_training}" \ - rollout.nnodes="${NNODES}" \ - rollout.n_gpus_per_node="${n_gpus_rollout}" \ - rollout.total_rollout_steps="${total_rollout_steps}" \ - rollout.total_epochs=10 \ - rollout.test_freq="${test_freq}" \ - async_training.staleness_threshold="${staleness_threshold}" \ - async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ - async_training.partial_rollout="${partial_rollout}" diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_8-24/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_8-24/runtime_env.yaml deleted file mode 100644 index dcca08e67f7..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_8-24/runtime_env.yaml +++ /dev/null @@ -1,5 +0,0 @@ -env_vars: - VLLM_USE_V1: "1" - TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard/dapo_7b_math_fsdp2_4_12" - NCCL_DEBUG: "INFO" - HYDRA_FULL_ERROR: "1" \ No newline at end of file From 07ae4a00ae30faa8275f71d3a95521cceaf3effe Mon Sep 17 00:00:00 2001 From: wangshulin02 <953550366@qq.com> Date: Mon, 8 Sep 2025 17:26:01 +0800 Subject: [PATCH 110/182] add empty_cache after sync_rollout_weights --- recipe/one_step_off_policy/fsdp_workers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/recipe/one_step_off_policy/fsdp_workers.py b/recipe/one_step_off_policy/fsdp_workers.py index 086f109e434..dd941c26684 100644 --- a/recipe/one_step_off_policy/fsdp_workers.py +++ b/recipe/one_step_off_policy/fsdp_workers.py @@ -100,6 +100,7 @@ def sync_rollout_weights(self): collective.broadcast(tensor, src_rank=0, group_name="actor_rollout") if self._is_rollout: inference_model.load_weights([(key, tensor)]) + get_torch_device().empty_cache() 
@register(dispatch_mode=Dispatch.ONE_TO_ALL) def get_actor_weights_info(self): From 34cf9e7b91fbabb6761a74047bf33fe13bfb5318 Mon Sep 17 00:00:00 2001 From: wangshulin02 <953550366@qq.com> Date: Mon, 8 Sep 2025 17:42:01 +0800 Subject: [PATCH 111/182] add exp fully_async 128 64-64 --- .../fsdp2_fully-async_16-16/runtime_env.yaml | 5 - ..._fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh | 174 ------------------ .../fsdp2_fully-async_24-8/runtime_env.yaml | 5 - ...28k_fsdp2_fully-async_64-64_mbs32_tfq4.sh} | 36 ++-- .../fsdp2_fully-async_64-64/runtime_env.yaml | 4 + ..._fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh | 174 ------------------ .../fsdp2_fully-async_8-24/runtime_env.yaml | 5 - 7 files changed, 19 insertions(+), 384 deletions(-) delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_16-16/runtime_env.yaml delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_24-8/runtime_env.yaml rename recipe/fully_async_policy/exp/qwen2-7B-math_128/{fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh => fsdp2_fully-async_64-64/dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tfq4.sh} (90%) create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/runtime_env.yaml delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_8-24/runtime_env.yaml diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_16-16/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_16-16/runtime_env.yaml deleted file mode 100644 index dcca08e67f7..00000000000 --- 
a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_16-16/runtime_env.yaml +++ /dev/null @@ -1,5 +0,0 @@ -env_vars: - VLLM_USE_V1: "1" - TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard/dapo_7b_math_fsdp2_4_12" - NCCL_DEBUG: "INFO" - HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh deleted file mode 100644 index 618497c0257..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh +++ /dev/null @@ -1,174 +0,0 @@ -#!/usr/bin/env bash -set -xeuo pipefail -# dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20 -project_name='DAPO' -exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20' - -# Ray -# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} -# WORKING_DIR=${WORKING_DIR:-"${PWD}"} -# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} -# Paths -RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} -# very important! 
please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface -MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"} -CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} -TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} -TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} - -# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B -MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B -CKPTS_DIR=./ckpts/${project_name}/${exp_name} -# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet -# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet -TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet -TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet - -rollout_mode="async" -rollout_name="vllm" # sglang or vllm -if [ "$rollout_mode" = "async" ]; then - export VLLM_USE_V1=1 - return_raw_chat="True" -fi - -# Algorithm parameters -adv_estimator=grpo - -use_kl_in_reward=False -kl_coef=0.0 -use_kl_loss=False -kl_loss_coef=0.0 - -clip_ratio_low=0.2 -clip_ratio_high=0.28 - -# Response length parameters -max_prompt_length=$((1024 * 2)) -max_response_length=$((1024 *8)) -enable_overlong_buffer=True -overlong_buffer_len=$((1024 * 4)) -overlong_penalty_factor=1.0 - -# Training parameters -loss_agg_mode="token-mean" - -# Algorithm -temperature=1.0 -top_p=1.0 -top_k=-1 # 0 for HF rollout, -1 for vLLM rollout -val_top_p=0.7 - -# Performance Related Parameter -use_dynamic_bsz=True -actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) -infer_ppo_max_token_len=$(((max_prompt_length + 
max_response_length) * 3)) -ref_offload=True -actor_offload=False -gen_tp=1 -sp_size=1 -fsdp_size=2 - -# Fully async specific parameters -NNODES=${NNODES:-2} -NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} - -n_gpus_rollout=6 -n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout)) - -train_prompt_bsz=0 -gen_prompt_bsz=1 -n_resp_per_prompt=16 -train_prompt_mini_bsz=32 -total_rollout_steps=$(((512*100))) -test_freq=10 -staleness_threshold=1 -trigger_parameter_sync_step=64 -partial_rollout=True - -PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python" -if [ ! -x "$PYTHON_INTERPRETER" ]; then - PYTHON_INTERPRETER="python3" -fi - -$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \ - data.train_files="${TRAIN_FILE}" \ - data.val_files="${TEST_FILE}" \ - data.prompt_key=prompt \ - data.truncation='left' \ - data.max_prompt_length=${max_prompt_length} \ - data.max_response_length=${max_response_length} \ - data.train_batch_size=${train_prompt_bsz} \ - data.gen_batch_size=${gen_prompt_bsz} \ - data.return_raw_chat=${return_raw_chat} \ - actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ - algorithm.adv_estimator=${adv_estimator} \ - algorithm.use_kl_in_reward=${use_kl_in_reward} \ - algorithm.kl_ctrl.kl_coef=${kl_coef} \ - actor_rollout_ref.actor.strategy=fsdp2 \ - critic.strategy=fsdp2 \ - actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ - actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ - actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ - actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ - actor_rollout_ref.actor.clip_ratio_c=10.0 \ - actor_rollout_ref.model.use_remove_padding=True \ - actor_rollout_ref.hybrid_engine=False \ - +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ - actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - 
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ - actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.model.path="${MODEL_PATH}" \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ - actor_rollout_ref.actor.optim.weight_decay=0.1 \ - actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ - actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \ - actor_rollout_ref.actor.entropy_coeff=0 \ - actor_rollout_ref.actor.grad_clip=1.0 \ - actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ - actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ - actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ - actor_rollout_ref.rollout.enable_chunked_prefill=True \ - actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ - actor_rollout_ref.rollout.temperature=${temperature} \ - actor_rollout_ref.rollout.top_p=${top_p} \ - actor_rollout_ref.rollout.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ - actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ - actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.do_sample=True \ - actor_rollout_ref.rollout.val_kwargs.n=1 \ - actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \ - actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ - actor_rollout_ref.rollout.name=${rollout_name} \ - actor_rollout_ref.rollout.mode=${rollout_mode} \ - actor_rollout_ref.rollout.calculate_log_probs=True \ - reward_model.reward_manager=dapo \ - 
+reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ - +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ - trainer.logger=['console','tensorboard'] \ - trainer.project_name="${project_name}" \ - trainer.experiment_name="${exp_name}" \ - trainer.val_before_train=True \ - trainer.save_freq=-1 \ - trainer.default_local_dir="${CKPTS_DIR}" \ - trainer.resume_mode=auto \ - trainer.nnodes="${NNODES}" \ - trainer.n_gpus_per_node="${n_gpus_training}" \ - rollout.nnodes="${NNODES}" \ - rollout.n_gpus_per_node="${n_gpus_rollout}" \ - rollout.total_rollout_steps="${total_rollout_steps}" \ - rollout.total_epochs=10 \ - rollout.test_freq="${test_freq}" \ - async_training.staleness_threshold="${staleness_threshold}" \ - async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ - async_training.partial_rollout="${partial_rollout}" diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_24-8/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_24-8/runtime_env.yaml deleted file mode 100644 index dcca08e67f7..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_24-8/runtime_env.yaml +++ /dev/null @@ -1,5 +0,0 @@ -env_vars: - VLLM_USE_V1: "1" - TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard/dapo_7b_math_fsdp2_4_12" - NCCL_DEBUG: "INFO" - HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh 
b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tfq4.sh similarity index 90% rename from recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh rename to recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tfq4.sh index 618497c0257..ce69e60e2b6 100644 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tfq4.sh @@ -1,8 +1,8 @@ #!/usr/bin/env bash set -xeuo pipefail -# dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20 + project_name='DAPO' -exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20' +exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tpf4' # Ray # RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} @@ -44,7 +44,7 @@ clip_ratio_high=0.28 # Response length parameters max_prompt_length=$((1024 * 2)) -max_response_length=$((1024 *8)) +max_response_length=$((1024 * 28)) enable_overlong_buffer=True overlong_buffer_len=$((1024 * 4)) overlong_penalty_factor=1.0 @@ -64,33 +64,27 @@ actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) ref_offload=True actor_offload=False -gen_tp=1 -sp_size=1 +gen_tp=4 +sp_size=4 fsdp_size=2 # Fully async specific parameters -NNODES=${NNODES:-2} +NNODES_ROLLOUT=${NNODES_ROLLOUT:-8} +NNODES_TRAIN=${NNODES_TRAIN:-8} NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} -n_gpus_rollout=6 -n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout)) train_prompt_bsz=0 gen_prompt_bsz=1 n_resp_per_prompt=16 train_prompt_mini_bsz=32 -total_rollout_steps=$(((512*100))) -test_freq=10 
+total_rollout_steps=$(((512*400))) +test_freq=20 staleness_threshold=1 -trigger_parameter_sync_step=64 +trigger_parameter_sync_step=4 partial_rollout=True -PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python" -if [ ! -x "$PYTHON_INTERPRETER" ]; then - PYTHON_INTERPRETER="python3" -fi - -$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \ +python -m recipe.fully_async_policy.fully_async_main \ data.train_files="${TRAIN_FILE}" \ data.val_files="${TEST_FILE}" \ data.prompt_key=prompt \ @@ -162,10 +156,10 @@ $PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \ trainer.save_freq=-1 \ trainer.default_local_dir="${CKPTS_DIR}" \ trainer.resume_mode=auto \ - trainer.nnodes="${NNODES}" \ - trainer.n_gpus_per_node="${n_gpus_training}" \ - rollout.nnodes="${NNODES}" \ - rollout.n_gpus_per_node="${n_gpus_rollout}" \ + trainer.nnodes="${NNODES_TRAIN}" \ + trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ + rollout.nnodes="${NNODES_ROLLOUT}" \ + rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ rollout.total_rollout_steps="${total_rollout_steps}" \ rollout.total_epochs=10 \ rollout.test_freq="${test_freq}" \ diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/runtime_env.yaml new file mode 100644 index 00000000000..949fa4ef005 --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/runtime_env.yaml @@ -0,0 +1,4 @@ +env_vars: + VLLM_USE_V1: "1" + TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-128/dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tpf4" + HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh 
b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh deleted file mode 100644 index 618497c0257..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20.sh +++ /dev/null @@ -1,174 +0,0 @@ -#!/usr/bin/env bash -set -xeuo pipefail -# dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20 -project_name='DAPO' -exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf_tfq20' - -# Ray -# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} -# WORKING_DIR=${WORKING_DIR:-"${PWD}"} -# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} -# Paths -RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} -# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface -MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"} -CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} -TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} -TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} - -# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B -MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B -CKPTS_DIR=./ckpts/${project_name}/${exp_name} -# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet -# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet -TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet -TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet - -rollout_mode="async" 
-rollout_name="vllm" # sglang or vllm -if [ "$rollout_mode" = "async" ]; then - export VLLM_USE_V1=1 - return_raw_chat="True" -fi - -# Algorithm parameters -adv_estimator=grpo - -use_kl_in_reward=False -kl_coef=0.0 -use_kl_loss=False -kl_loss_coef=0.0 - -clip_ratio_low=0.2 -clip_ratio_high=0.28 - -# Response length parameters -max_prompt_length=$((1024 * 2)) -max_response_length=$((1024 *8)) -enable_overlong_buffer=True -overlong_buffer_len=$((1024 * 4)) -overlong_penalty_factor=1.0 - -# Training parameters -loss_agg_mode="token-mean" - -# Algorithm -temperature=1.0 -top_p=1.0 -top_k=-1 # 0 for HF rollout, -1 for vLLM rollout -val_top_p=0.7 - -# Performance Related Parameter -use_dynamic_bsz=True -actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) -infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) -ref_offload=True -actor_offload=False -gen_tp=1 -sp_size=1 -fsdp_size=2 - -# Fully async specific parameters -NNODES=${NNODES:-2} -NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} - -n_gpus_rollout=6 -n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout)) - -train_prompt_bsz=0 -gen_prompt_bsz=1 -n_resp_per_prompt=16 -train_prompt_mini_bsz=32 -total_rollout_steps=$(((512*100))) -test_freq=10 -staleness_threshold=1 -trigger_parameter_sync_step=64 -partial_rollout=True - -PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python" -if [ ! 
-x "$PYTHON_INTERPRETER" ]; then - PYTHON_INTERPRETER="python3" -fi - -$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \ - data.train_files="${TRAIN_FILE}" \ - data.val_files="${TEST_FILE}" \ - data.prompt_key=prompt \ - data.truncation='left' \ - data.max_prompt_length=${max_prompt_length} \ - data.max_response_length=${max_response_length} \ - data.train_batch_size=${train_prompt_bsz} \ - data.gen_batch_size=${gen_prompt_bsz} \ - data.return_raw_chat=${return_raw_chat} \ - actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ - algorithm.adv_estimator=${adv_estimator} \ - algorithm.use_kl_in_reward=${use_kl_in_reward} \ - algorithm.kl_ctrl.kl_coef=${kl_coef} \ - actor_rollout_ref.actor.strategy=fsdp2 \ - critic.strategy=fsdp2 \ - actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ - actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ - actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ - actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ - actor_rollout_ref.actor.clip_ratio_c=10.0 \ - actor_rollout_ref.model.use_remove_padding=True \ - actor_rollout_ref.hybrid_engine=False \ - +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ - actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ - actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.model.path="${MODEL_PATH}" \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ - actor_rollout_ref.actor.optim.weight_decay=0.1 \ - actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ - 
actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \ - actor_rollout_ref.actor.entropy_coeff=0 \ - actor_rollout_ref.actor.grad_clip=1.0 \ - actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ - actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ - actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ - actor_rollout_ref.rollout.enable_chunked_prefill=True \ - actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ - actor_rollout_ref.rollout.temperature=${temperature} \ - actor_rollout_ref.rollout.top_p=${top_p} \ - actor_rollout_ref.rollout.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ - actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ - actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.do_sample=True \ - actor_rollout_ref.rollout.val_kwargs.n=1 \ - actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \ - actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ - actor_rollout_ref.rollout.name=${rollout_name} \ - actor_rollout_ref.rollout.mode=${rollout_mode} \ - actor_rollout_ref.rollout.calculate_log_probs=True \ - reward_model.reward_manager=dapo \ - +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ - +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ - trainer.logger=['console','tensorboard'] \ - trainer.project_name="${project_name}" \ - trainer.experiment_name="${exp_name}" \ - 
trainer.val_before_train=True \ - trainer.save_freq=-1 \ - trainer.default_local_dir="${CKPTS_DIR}" \ - trainer.resume_mode=auto \ - trainer.nnodes="${NNODES}" \ - trainer.n_gpus_per_node="${n_gpus_training}" \ - rollout.nnodes="${NNODES}" \ - rollout.n_gpus_per_node="${n_gpus_rollout}" \ - rollout.total_rollout_steps="${total_rollout_steps}" \ - rollout.total_epochs=10 \ - rollout.test_freq="${test_freq}" \ - async_training.staleness_threshold="${staleness_threshold}" \ - async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ - async_training.partial_rollout="${partial_rollout}" diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_8-24/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_8-24/runtime_env.yaml deleted file mode 100644 index dcca08e67f7..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_8-24/runtime_env.yaml +++ /dev/null @@ -1,5 +0,0 @@ -env_vars: - VLLM_USE_V1: "1" - TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard/dapo_7b_math_fsdp2_4_12" - NCCL_DEBUG: "INFO" - HYDRA_FULL_ERROR: "1" \ No newline at end of file From a7c06551baf74e3774e66cecd78e8dfa34c5b116 Mon Sep 17 00:00:00 2001 From: wangshulin02 <953550366@qq.com> Date: Mon, 8 Sep 2025 21:55:57 +0800 Subject: [PATCH 112/182] fix max_concurrent_samples, fix progress_bar --- recipe/fully_async_policy/fully_async_rollouter.py | 4 +++- recipe/fully_async_policy/fully_async_trainer.py | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index f3a25c2c30c..5d612dc4679 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -164,7 +164,9 @@ async def set_required_samples(self, required_samples: int): ) # 单次最多扔一次更新需要的样本 - self.max_concurrent_samples = 
self.required_samples + self.max_concurrent_samples = int(self.config.actor_rollout_ref.actor.ppo_mini_batch_size / \ + self.config.actor_rollout_ref.rollout.n * self.async_rollout_manager.rollout_dp_size * 4) + self.max_concurrent_samples = min(self.max_concurrent_samples, self.max_required_samples) self.max_queue_size = self.max_required_samples print( diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 0a200e76b1d..2b549c0b621 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -335,6 +335,7 @@ def fit(self): self.logger.log(data=val_data.metrics, step=val_data.param_version) self.logger.log(data=val_data.timing_raw, step=val_data.param_version) pprint(f"[FullyAsyncTrainer] Final validation metrics: {val_data.metrics}") + self.progress_bar.close() self._check_save_checkpoint(True, timing_raw) # TODO: 检查checkpoint @@ -356,6 +357,7 @@ def _trigger_parameter_sync_after_step(self, validate: bool = False, global_step data=self.metrics_aggregator.get_aggregated_metrics(), step=self.current_param_version, ) + self.progress_bar.update(1) self.metrics_aggregator.reset() ray.get(self.param_synchronizer.wait_last_sync.remote()) ray.get(self.param_synchronizer.sync_weights.remote(self.current_param_version, From 0e1f2d79cf2b0921c36bd4515e0df8b3327a02c4 Mon Sep 17 00:00:00 2001 From: wangshulin02 <953550366@qq.com> Date: Tue, 9 Sep 2025 00:31:02 +0800 Subject: [PATCH 113/182] change max_concurrent_samples num & change some exp --- ...apo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tfq4.sh | 2 +- .../qwen2-7B-math_128/fsdp2_fully-async_64-64/runtime_env.yaml | 2 +- ...po_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tfq16.sh | 2 +- .../qwen2-7B-math_32/fsdp2_fully-async_16-16/runtime_env.yaml | 3 +-- ...apo_qwen2-7B-math_28k_fsdp2_fully-async_32-32_mbs32_tfq8.sh | 2 +- .../qwen2-7B-math_64/fsdp2_fully-async_32-32/runtime_env.yaml | 3 +-- 
recipe/fully_async_policy/fully_async_rollouter.py | 2 +- 7 files changed, 7 insertions(+), 9 deletions(-) diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tfq4.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tfq4.sh index ce69e60e2b6..9f410f95c6c 100644 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tfq4.sh +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tfq4.sh @@ -2,7 +2,7 @@ set -xeuo pipefail project_name='DAPO' -exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tpf4' +exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tpf4_fixmcs' # Ray # RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/runtime_env.yaml index 949fa4ef005..5dfe2294911 100644 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/runtime_env.yaml +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/runtime_env.yaml @@ -1,4 +1,4 @@ env_vars: VLLM_USE_V1: "1" - TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-128/dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tpf4" + TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-128/dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tpf4_fixmcs" HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tfq16.sh 
b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tfq16.sh index c49a6460696..fcc5f472d8c 100644 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tfq16.sh +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tfq16.sh @@ -2,7 +2,7 @@ set -xeuo pipefail project_name='DAPO' -exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tpf16' +exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tpf16_fixmcs' # Ray # RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/runtime_env.yaml index de7e1aa0e1c..b3063ebc7f1 100644 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/runtime_env.yaml +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/runtime_env.yaml @@ -1,5 +1,4 @@ env_vars: VLLM_USE_V1: "1" - TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tpf16" - NCCL_DEBUG: "INFO" + TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tpf16_fixmcs" HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/dapo_qwen2-7B-math_28k_fsdp2_fully-async_32-32_mbs32_tfq8.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/dapo_qwen2-7B-math_28k_fsdp2_fully-async_32-32_mbs32_tfq8.sh index c31c59df4db..221d3c4d5a6 100644 --- 
a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/dapo_qwen2-7B-math_28k_fsdp2_fully-async_32-32_mbs32_tfq8.sh +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/dapo_qwen2-7B-math_28k_fsdp2_fully-async_32-32_mbs32_tfq8.sh @@ -2,7 +2,7 @@ set -xeuo pipefail project_name='DAPO' -exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_32-32_mbs32_tpf8' +exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_32-32_mbs32_tpf8_fixmcs' # Ray # RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/runtime_env.yaml index 20d464776b0..160cd46c499 100644 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/runtime_env.yaml +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/runtime_env.yaml @@ -1,5 +1,4 @@ env_vars: VLLM_USE_V1: "1" - TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-64/dapo_qwen2-7B-math_28k_fsdp2_fully-async_32-32_mbs32_tpf8" - NCCL_DEBUG: "INFO" + TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-64/dapo_qwen2-7B-math_28k_fsdp2_fully-async_32-32_mbs32_tpf8_fixmcs" HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 5d612dc4679..35e199addcb 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -165,7 +165,7 @@ async def set_required_samples(self, required_samples: int): # 单次最多扔一次更新需要的样本 self.max_concurrent_samples = int(self.config.actor_rollout_ref.actor.ppo_mini_batch_size / \ - self.config.actor_rollout_ref.rollout.n * self.async_rollout_manager.rollout_dp_size * 4) + 
self.config.actor_rollout_ref.rollout.n * self.async_rollout_manager.rollout_dp_size * 8) self.max_concurrent_samples = min(self.max_concurrent_samples, self.max_required_samples) self.max_queue_size = self.max_required_samples From 6c557d61659516d840145a0f9f2ce9bf07f74c51 Mon Sep 17 00:00:00 2001 From: wangshulin02 <953550366@qq.com> Date: Tue, 9 Sep 2025 14:15:05 +0800 Subject: [PATCH 114/182] remove unused code, add stale 0.1 exp --- ...28k_fsdp2_fully-async_16-16_mbs32_tfq16.sh | 168 ++++++++++++++++++ .../runtime_env.yaml | 4 + .../fully_async_rollouter.py | 24 +-- recipe/fully_async_policy/message_queue.py | 7 - 4 files changed, 179 insertions(+), 24 deletions(-) create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16_stal0.1/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tfq16.sh create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16_stal0.1/runtime_env.yaml diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16_stal0.1/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tfq16.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16_stal0.1/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tfq16.sh new file mode 100644 index 00000000000..2217661dd33 --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16_stal0.1/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tfq16.sh @@ -0,0 +1,168 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +project_name='DAPO' +exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tpf16_stal0.1' + +# Ray +# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} +# WORKING_DIR=${WORKING_DIR:-"${PWD}"} +# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} +# Paths +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +# very important! 
please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface +MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"} +CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} +TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} +TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} + +# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B +MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B +CKPTS_DIR=./ckpts/${project_name}/${exp_name} +# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet +# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet +TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet +TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet + +rollout_mode="async" +rollout_name="vllm" # sglang or vllm +if [ "$rollout_mode" = "async" ]; then + export VLLM_USE_V1=1 + return_raw_chat="True" +fi + +# Algorithm parameters +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +# Response length parameters +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 * 28)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 + +# Training parameters +loss_agg_mode="token-mean" + +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + +# Performance Related Parameter +use_dynamic_bsz=True +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) +infer_ppo_max_token_len=$(((max_prompt_length + 
max_response_length) * 3)) +ref_offload=True +actor_offload=False +gen_tp=4 +sp_size=4 +fsdp_size=2 + +# Fully async specific parameters +NNODES_ROLLOUT=${NNODES_ROLLOUT:-2} +NNODES_TRAIN=${NNODES_TRAIN:-2} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} + + +train_prompt_bsz=0 +gen_prompt_bsz=1 +n_resp_per_prompt=16 +train_prompt_mini_bsz=32 +total_rollout_steps=$(((512*400))) +test_freq=20 +staleness_threshold=0.1 +trigger_parameter_sync_step=16 +partial_rollout=True + +python -m recipe.fully_async_policy.fully_async_main \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + data.gen_batch_size=${gen_prompt_bsz} \ + data.return_raw_chat=${return_raw_chat} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.strategy=fsdp2 \ + critic.strategy=fsdp2 \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.hybrid_engine=False \ + +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ + actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + 
actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.grad_clip=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \ + actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ + actor_rollout_ref.rollout.name=${rollout_name} \ + actor_rollout_ref.rollout.mode=${rollout_mode} \ + actor_rollout_ref.rollout.calculate_log_probs=True \ + reward_model.reward_manager=dapo \ + +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + 
+reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger=['console','tensorboard'] \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.val_before_train=True \ + trainer.save_freq=-1 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.resume_mode=auto \ + trainer.nnodes="${NNODES_TRAIN}" \ + trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ + rollout.nnodes="${NNODES_ROLLOUT}" \ + rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ + rollout.total_rollout_steps="${total_rollout_steps}" \ + rollout.total_epochs=10 \ + rollout.test_freq="${test_freq}" \ + async_training.staleness_threshold="${staleness_threshold}" \ + async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ + async_training.partial_rollout="${partial_rollout}" diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16_stal0.1/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16_stal0.1/runtime_env.yaml new file mode 100644 index 00000000000..0b188206127 --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16_stal0.1/runtime_env.yaml @@ -0,0 +1,4 @@ +env_vars: + VLLM_USE_V1: "1" + TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tpf16_stal0.1" + HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 35e199addcb..c25a52abbe0 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -113,7 +113,7 @@ def __init__( self.async_rollout_manager = None # Config - 
self.staleness_threshold: int = config.async_training.get("staleness_threshold", 1) + self.staleness_threshold: float = config.async_training.get("staleness_threshold", 1) self.required_samples = None self.max_required_samples = None # 单次最多扔一次更新需要的样本 @@ -153,7 +153,7 @@ async def set_message_queue_client(self, message_queue_client: MessageQueueClien async def set_required_samples(self, required_samples: int): async with self.lock: self.required_samples = int(required_samples) - self.max_required_samples = ( + self.max_required_samples = int( self.required_samples * (self.staleness_threshold + 1) * self.config.async_training.trigger_parameter_sync_step @@ -164,8 +164,11 @@ async def set_required_samples(self, required_samples: int): ) # 单次最多扔一次更新需要的样本 - self.max_concurrent_samples = int(self.config.actor_rollout_ref.actor.ppo_mini_batch_size / \ - self.config.actor_rollout_ref.rollout.n * self.async_rollout_manager.rollout_dp_size * 8) + self.max_concurrent_samples = int( + self.config.actor_rollout_ref.actor.ppo_mini_batch_size + / self.config.actor_rollout_ref.rollout.n + * self.async_rollout_manager.rollout_dp_size * 8 + ) self.max_concurrent_samples = min(self.max_concurrent_samples, self.max_required_samples) self.max_queue_size = self.max_required_samples @@ -548,19 +551,6 @@ async def _should_pause_generation(self) -> bool: """Determine whether the build should be paused""" queue_stats = self.message_queue_client.get_statistics_sync() queue_size = queue_stats["queue_size"] - current_trainer_version = queue_stats["current_param_version"] - - version_diff = self.current_param_version - current_trainer_version - - if version_diff > self.staleness_threshold: - if not self.paused: - print( - "[FullyAsyncRollouter][ShouldPause] " - f"due to version_diff > self.staleness_threshold: " - f"rollout_version={self.current_param_version}, " - f"trainer_version={current_trainer_version}, diff={version_diff}" - ) - return True if queue_size >= self.max_queue_size: if not 
self.paused: diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py index 0520ec98034..da1780deb47 100644 --- a/recipe/fully_async_policy/message_queue.py +++ b/recipe/fully_async_policy/message_queue.py @@ -78,13 +78,6 @@ async def put_sample(self, sample: Any, param_version: int) -> bool: bool: Whether the sample was successfully put into the queue """ async with self._lock: - # Check freshness - staleness = self.current_param_version - param_version - if staleness > self.staleness_threshold: - self.dropped_samples += 1 - print(f"Dropped stale sample: staleness={staleness}, threshold={self.staleness_threshold}") - return False - # If queue is full, remove the oldest sample (rarely happens) if len(self.queue) >= self.max_queue_size: self.queue.popleft() From 15b53c8fcf3d7040d8ffc2b1a2ea4ce2ed3eb667 Mon Sep 17 00:00:00 2001 From: hadoop-ai-search Date: Tue, 9 Sep 2025 16:35:06 +0800 Subject: [PATCH 115/182] reset one step --- recipe/fully_async_policy/fsdp_workers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/recipe/fully_async_policy/fsdp_workers.py b/recipe/fully_async_policy/fsdp_workers.py index 086f109e434..dd941c26684 100644 --- a/recipe/fully_async_policy/fsdp_workers.py +++ b/recipe/fully_async_policy/fsdp_workers.py @@ -100,6 +100,7 @@ def sync_rollout_weights(self): collective.broadcast(tensor, src_rank=0, group_name="actor_rollout") if self._is_rollout: inference_model.load_weights([(key, tensor)]) + get_torch_device().empty_cache() @register(dispatch_mode=Dispatch.ONE_TO_ALL) def get_actor_weights_info(self): From 5249bcda3a7765173ba202e59662c6af6e9b5895 Mon Sep 17 00:00:00 2001 From: hadoop-ai-search Date: Tue, 9 Sep 2025 17:15:51 +0800 Subject: [PATCH 116/182] unchange protobuf --- .../unittest/test_protocol_split_merge.py | 621 ------------------ verl/protocol.py | 166 +---- 2 files changed, 2 insertions(+), 785 deletions(-) delete mode 100644 
recipe/fully_async_policy/unittest/test_protocol_split_merge.py diff --git a/recipe/fully_async_policy/unittest/test_protocol_split_merge.py b/recipe/fully_async_policy/unittest/test_protocol_split_merge.py deleted file mode 100644 index a5c61f11ba6..00000000000 --- a/recipe/fully_async_policy/unittest/test_protocol_split_merge.py +++ /dev/null @@ -1,621 +0,0 @@ -# Copyright 2025 Meituan Ltd. and/or its affiliates -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import torch -from tensordict import TensorDict - -from verl.protocol import DataProto, DataProtoItem - - -def create_sample_dataproto(): - """Create a DataProto similar to the provided example.""" - - # Create tensor data similar to the example - batch_size = 12 - - # Tensor data - attention_mask = torch.ones(batch_size, 3072, dtype=torch.int64) - input_ids = torch.randint(0, 32000, (batch_size, 3072), dtype=torch.int64) - position_ids = torch.arange(3072).unsqueeze(0).repeat(batch_size, 1).long() - prompts = torch.randint(0, 32000, (batch_size, 1024), dtype=torch.int64) - response_mask = torch.ones(batch_size, 2048, dtype=torch.int64) - responses = torch.randint(0, 32000, (batch_size, 2048), dtype=torch.int64) - - # Non-tensor data similar to the example - data_source = np.array(["openai/gsm8k"] * batch_size, dtype=object) - ability = np.array(["math"] * batch_size, dtype=object) - - reward_model = np.array( - [ - {"ground_truth": "6", "style": "rule"}, - {"ground_truth": "6", "style": 
"rule"}, - {"ground_truth": "220000", "style": "rule"}, - {"ground_truth": "277", "style": "rule"}, - {"ground_truth": "277", "style": "rule"}, - {"ground_truth": "35", "style": "rule"}, - {"ground_truth": "6", "style": "rule"}, - {"ground_truth": "220000", "style": "rule"}, - {"ground_truth": "220000", "style": "rule"}, - {"ground_truth": "277", "style": "rule"}, - {"ground_truth": "35", "style": "rule"}, - {"ground_truth": "35", "style": "rule"}, - ], - dtype=object, - ) - - extra_info = np.array( - [ - {"answer": "Answer 1", "index": 4570, "question": "Question 1", "split": "train"}, - {"answer": "Answer 1", "index": 4570, "question": "Question 1", "split": "train"}, - {"answer": "Answer 2", "index": 460, "question": "Question 2", "split": "train"}, - {"answer": "Answer 3", "index": 6613, "question": "Question 3", "split": "train"}, - {"answer": "Answer 3", "index": 6613, "question": "Question 3", "split": "train"}, - {"answer": "Answer 4", "index": 1421, "question": "Question 4", "split": "train"}, - {"answer": "Answer 1", "index": 4570, "question": "Question 1", "split": "train"}, - {"answer": "Answer 2", "index": 460, "question": "Question 2", "split": "train"}, - {"answer": "Answer 2", "index": 460, "question": "Question 2", "split": "train"}, - {"answer": "Answer 3", "index": 6613, "question": "Question 3", "split": "train"}, - {"answer": "Answer 4", "index": 1421, "question": "Question 4", "split": "train"}, - {"answer": "Answer 4", "index": 1421, "question": "Question 4", "split": "train"}, - ], - dtype=object, - ) - - uid = np.array( - [ - "80ae1835-a8db-4faa-8b42-2ffa2ca63f28", - "80ae1835-a8db-4faa-8b42-2ffa2ca63f28", - "cc529271-c2ba-4fe1-a16e-50c5f090538d", - "237ea082-350f-4193-b9a2-3a153a3a38b9", - "237ea082-350f-4193-b9a2-3a153a3a38b9", - "fab3e910-67b3-4653-bc69-377250049267", - "80ae1835-a8db-4faa-8b42-2ffa2ca63f28", - "cc529271-c2ba-4fe1-a16e-50c5f090538d", - "cc529271-c2ba-4fe1-a16e-50c5f090538d", - "237ea082-350f-4193-b9a2-3a153a3a38b9", - 
"fab3e910-67b3-4653-bc69-377250049267", - "fab3e910-67b3-4653-bc69-377250049267", - ], - dtype=object, - ) - - tools_kwargs = np.array([{}] * batch_size, dtype=object) - interaction_kwargs = np.array([{}] * batch_size, dtype=object) - index = np.array([4570, 4570, 460, 6613, 6613, 1421, 4570, 460, 460, 6613, 1421, 1421], dtype=object) - - # Create DataProto - data_proto = DataProto.from_dict( - tensors={ - "attention_mask": attention_mask, - "input_ids": input_ids, - "position_ids": position_ids, - "prompts": prompts, - "response_mask": response_mask, - "responses": responses, - }, - non_tensors={ - "data_source": data_source, - "ability": ability, - "reward_model": reward_model, - "extra_info": extra_info, - "uid": uid, - "tools_kwargs": tools_kwargs, - "interaction_kwargs": interaction_kwargs, - "index": index, - }, - meta_info={"global_token_num": [2141, 2141, 2161, 2151, 2151, 2130, 2141, 2161, 2161, 2151, 2130, 2130]}, - ) - - return data_proto - - -def test_basic_split_and_merge(): - """Test basic split and merge functionality.""" - print("=== Testing Basic Split and Merge ===") - - # Create sample data - original_proto = create_sample_dataproto() - original_length = len(original_proto) - - print(f"Original DataProto length: {original_length}") - print(f"Original tensor keys: {list(original_proto.batch.keys())}") - print(f"Original non_tensor keys: {list(original_proto.non_tensor_batch.keys())}") - - # Test split - items = original_proto.to_items() - - print(f"Split into {len(items)} items") - assert len(items) == original_length, f"Expected {original_length} items, got {len(items)}" - - # Verify individual items - for i, item in enumerate(items): - print(f"Item {i}: batch_size={item.batch.batch_size}, non_tensor keys={list(item.non_tensor_batch.keys())}") - - # Check that tensor shapes are correct (no batch dimension) - assert item.batch.batch_size == torch.Size([]), ( - f"Item {i} should have empty batch_size, got {item.batch.batch_size}" - ) - - # Check 
tensor shapes - assert item.batch["attention_mask"].shape == torch.Size([3072]), ( - f"Unexpected attention_mask shape: {item.batch['attention_mask'].shape}" - ) - assert item.batch["input_ids"].shape == torch.Size([3072]), ( - f"Unexpected input_ids shape: {item.batch['input_ids'].shape}" - ) - assert item.batch["prompts"].shape == torch.Size([1024]), ( - f"Unexpected prompts shape: {item.batch['prompts'].shape}" - ) - - # Check non-tensor data types - assert isinstance(item.non_tensor_batch["data_source"], str), ( - f"data_source should be str, got {type(item.non_tensor_batch['data_source'])}" - ) - assert isinstance(item.non_tensor_batch["reward_model"], dict), ( - f"reward_model should be dict, got {type(item.non_tensor_batch['reward_model'])}" - ) - assert isinstance(item.non_tensor_batch["extra_info"], dict), ( - f"extra_info should be dict, got {type(item.non_tensor_batch['extra_info'])}" - ) - - # Test merge - merged_proto = DataProto.from_items(items) - - print(f"Merged DataProto length: {len(merged_proto)}") - assert len(merged_proto) == original_length, f"Merged length should be {original_length}, got {len(merged_proto)}" - - # Verify tensor data consistency - for key in original_proto.batch.keys(): - original_tensor = original_proto.batch[key] - merged_tensor = merged_proto.batch[key] - - assert original_tensor.shape == merged_tensor.shape, ( - f"Shape mismatch for {key}: {original_tensor.shape} vs {merged_tensor.shape}" - ) - assert torch.equal(original_tensor, merged_tensor), f"Tensor data mismatch for {key}" - - # Verify non-tensor data consistency - for key in original_proto.non_tensor_batch.keys(): - original_array = original_proto.non_tensor_batch[key] - merged_array = merged_proto.non_tensor_batch[key] - - assert original_array.shape == merged_array.shape, ( - f"Shape mismatch for {key}: {original_array.shape} vs {merged_array.shape}" - ) - assert np.array_equal(original_array, merged_array), f"Non-tensor data mismatch for {key}" - - # Verify 
meta_info consistency - assert original_proto.meta_info == merged_proto.meta_info, "Meta info mismatch" - - print("✓ Basic split and merge test passed!") - - -def test_individual_item_access(): - """Test accessing individual items matches split results.""" - print("\n=== Testing Individual Item Access ===") - - original_proto = create_sample_dataproto() - items = original_proto.to_items() - - # Compare direct indexing with split results - for i in range(len(original_proto)): - direct_item = original_proto[i] - split_item = items[i] - - # Check tensor data - for key in original_proto.batch.keys(): - assert torch.equal(direct_item.batch[key], split_item.batch[key]), ( - f"Tensor mismatch at index {i}, key {key}" - ) - - # Check non-tensor data - for key in original_proto.non_tensor_batch.keys(): - if isinstance(direct_item.non_tensor_batch[key], np.ndarray): - assert np.array_equal(direct_item.non_tensor_batch[key], split_item.non_tensor_batch[key]), ( - f"Non-tensor mismatch at index {i}, key {key}" - ) - else: - assert direct_item.non_tensor_batch[key] == split_item.non_tensor_batch[key], ( - f"Non-tensor mismatch at index {i}, key {key}" - ) - - print("✓ Individual item access test passed!") - - -def test_partial_merge(): - """Test merging a subset of items.""" - print("\n=== Testing Partial Merge ===") - - original_proto = create_sample_dataproto() - items = original_proto.to_items() - - # Take a subset of items - subset_indices = [0, 2, 4, 7, 9] - subset_items = [items[i] for i in subset_indices] - - # Merge the subset - subset_proto = DataProto.from_items(subset_items) - - assert len(subset_proto) == len(subset_indices), ( - f"Subset length should be {len(subset_indices)}, got {len(subset_proto)}" - ) - - # Verify the subset contains correct data - for i, original_idx in enumerate(subset_indices): - # Compare with original data at original_idx - for key in original_proto.batch.keys(): - expected_tensor = original_proto.batch[key][original_idx] - actual_tensor = 
subset_proto.batch[key][i] - assert torch.equal(expected_tensor, actual_tensor), f"Subset tensor mismatch at {i}, key {key}" - - for key in original_proto.non_tensor_batch.keys(): - expected_value = original_proto.non_tensor_batch[key][original_idx] - actual_value = subset_proto.non_tensor_batch[key][i] - - if isinstance(expected_value, np.ndarray): - assert np.array_equal(expected_value, actual_value), f"Subset non-tensor mismatch at {i}, key {key}" - else: - assert expected_value == actual_value, f"Subset non-tensor mismatch at {i}, key {key}" - - print("✓ Partial merge test passed!") - - -def test_item_processing(): - """Test processing individual items before merging.""" - print("\n=== Testing Item Processing ===") - - original_proto = create_sample_dataproto() - items = original_proto.to_items() - - # Process each item (e.g., add a prefix to uid) - processed_items = [] - for i, item in enumerate(items): - processed_item = item.copy() # Create a copy to avoid modifying original - - # Modify some data - processed_item.non_tensor_batch["uid"] = f"processed_{i}_{processed_item.non_tensor_batch['uid']}" - processed_item.non_tensor_batch["processing_step"] = i - processed_item.meta_info["processed"] = True - - processed_items.append(processed_item) - - # Merge processed items - processed_proto = DataProto.from_items(processed_items) - - # Verify processing was applied - for i in range(len(processed_proto)): - expected_uid = f"processed_{i}_{items[i].non_tensor_batch['uid']}" - actual_uid = processed_proto.non_tensor_batch["uid"][i] - assert actual_uid == expected_uid, ( - f"Processing failed for uid at {i}: expected {expected_uid}, got {actual_uid}" - ) - - expected_step = i - actual_step = processed_proto.non_tensor_batch["processing_step"][i] - assert actual_step == expected_step, ( - f"Processing step mismatch at {i}: expected {expected_step}, got {actual_step}" - ) - - # assert processed_proto.meta_info.get("processed") == True, "Meta info processing failed" - - 
print("✓ Item processing test passed!") - - -def test_error_conditions(): - """Test error conditions.""" - print("\n=== Testing Error Conditions ===") - - # Test empty list - try: - DataProto.from_items([]) - except ValueError as e: - print(f"✓ Correctly caught empty list error: {e}") - - # Test inconsistent structure - try: - # Create items with different tensor keys - original_proto = create_sample_dataproto() - items = original_proto.to_items() - - # Modify one item to have different keys - modified_item = items[1].copy() - modified_item.batch = TensorDict({"different_key": torch.randn(3072)}, batch_size=torch.Size([])) - - inconsistent_items = [items[0], modified_item] - DataProto.from_items(inconsistent_items) - except ValueError as e: - print(f"✓ Correctly caught inconsistent structure error: {e}") - - print("✓ Error conditions test passed!") - - -def test_roundtrip_integrity(): - """Test multiple split/merge cycles maintain data integrity.""" - print("\n=== Testing Roundtrip Integrity ===") - - original_proto = create_sample_dataproto() - current_proto = original_proto - - # Perform multiple split/merge cycles - for cycle in range(3): - print(f"Cycle {cycle + 1}") - - # Split - items = current_proto.to_items() - - # Merge - current_proto = DataProto.from_items(items) - - # Verify integrity - assert len(current_proto) == len(original_proto), f"Length changed in cycle {cycle + 1}" - - for key in original_proto.batch.keys(): - assert torch.equal(original_proto.batch[key], current_proto.batch[key]), ( - f"Tensor {key} changed in cycle {cycle + 1}" - ) - - for key in original_proto.non_tensor_batch.keys(): - assert np.array_equal(original_proto.non_tensor_batch[key], current_proto.non_tensor_batch[key]), ( - f"Non-tensor {key} changed in cycle {cycle + 1}" - ) - - assert original_proto.meta_info == current_proto.meta_info, f"Meta info changed in cycle {cycle + 1}" - - print("✓ Roundtrip integrity test passed!") - - -def run_visual_comparison(): - """Run a visual 
comparison similar to the user's example.""" - print("\n=== Visual Comparison (Like User Example) ===") - - original_proto = create_sample_dataproto() - - print("Original DataProto:") - print(f"batch_size: {original_proto.batch.batch_size}") - print(f"tensor keys: {list(original_proto.batch.keys())}") - print(f"non_tensor keys: {list(original_proto.non_tensor_batch.keys())}") - print(f"Sample data_source: {original_proto.non_tensor_batch['data_source'][:3]}") - print(f"Sample uid: {original_proto.non_tensor_batch['uid'][:3]}") - - print("\n" + "=" * 50) - print("============= SPLIT =============") - print("=" * 50) - - items = original_proto.to_items() - - # Show first few items - for i in range(min(3, len(items))): - print(f"\nDataProtoItem {i}:") - print(f"batch_size: {items[i].batch.batch_size}") - print(f"attention_mask shape: {items[i].batch['attention_mask'].shape}") - print(f"input_ids shape: {items[i].batch['input_ids'].shape}") - print(f"data_source: {items[i].non_tensor_batch['data_source']}") - print(f"uid: {items[i].non_tensor_batch['uid']}") - print(f"reward_model: {items[i].non_tensor_batch['reward_model']}") - print("-" * 30) - - print("\n" + "=" * 50) - print("============= MERGE =============") - print("=" * 50) - - merged_proto = DataProto.from_items(items) - - print("Merged DataProto:") - print(f"batch_size: {merged_proto.batch.batch_size}") - print(f"tensor keys: {list(merged_proto.batch.keys())}") - print(f"non_tensor keys: {list(merged_proto.non_tensor_batch.keys())}") - print(f"Sample data_source: {merged_proto.non_tensor_batch['data_source'][:3]}") - print(f"Sample uid: {merged_proto.non_tensor_batch['uid'][:3]}") - - # Verify they're identical - success = True - try: - for key in original_proto.batch.keys(): - assert torch.equal(original_proto.batch[key], merged_proto.batch[key]) - for key in original_proto.non_tensor_batch.keys(): - assert np.array_equal(original_proto.non_tensor_batch[key], merged_proto.non_tensor_batch[key]) - assert 
original_proto.meta_info == merged_proto.meta_info - print("\n✓ Original and merged DataProto are identical!") - except Exception as e: - print(f"\n✗ Verification failed: {e}") - success = False - - return success - - -def example_basic_split_merge(): - """Basic example of splitting DataProto into DataProtoItems and merging back.""" - print("=== Basic Split and Merge Example ===") - - # Create sample data - batch_size = 3 - seq_len = 5 - - # Create tensors - input_ids = torch.randint(0, 1000, (batch_size, seq_len)) - attention_mask = torch.ones(batch_size, seq_len) - - # Create non-tensor data - prompts = np.array(["Hello world", "How are you?", "Good morning"], dtype=object) - scores = np.array([0.8, 0.9, 0.7], dtype=object) - - # Create DataProto - data_proto = DataProto.from_dict( - tensors={"input_ids": input_ids, "attention_mask": attention_mask}, - non_tensors={"prompts": prompts, "scores": scores}, - meta_info={"model_name": "test_model", "version": "1.0"}, - ) - - print(f"Original DataProto length: {len(data_proto)}") - print(f"Input IDs shape: {data_proto.batch['input_ids'].shape}") - print(f"Prompts: {data_proto.non_tensor_batch['prompts']}") - - # Split into DataProtoItems - items = data_proto.to_items() - print(f"\nSplit into {len(items)} items") - - for i, item in enumerate(items): - print(f"Item {i}:") - print(f" Input IDs shape: {item.batch['input_ids'].shape}") - print(f" Prompt: {item.non_tensor_batch['prompts']}") - print(f" Score: {item.non_tensor_batch['scores']}") - - # Merge back to DataProto - merged_proto = DataProto.from_items(items) - print(f"\nMerged DataProto length: {len(merged_proto)}") - print(f"Merged Input IDs shape: {merged_proto.batch['input_ids'].shape}") - print(f"Merged prompts: {merged_proto.non_tensor_batch['prompts']}") - - # Verify they're identical - assert torch.equal(data_proto.batch["input_ids"], merged_proto.batch["input_ids"]) - assert torch.equal(data_proto.batch["attention_mask"], 
merged_proto.batch["attention_mask"]) - assert np.array_equal(data_proto.non_tensor_batch["prompts"], merged_proto.non_tensor_batch["prompts"]) - assert np.array_equal(data_proto.non_tensor_batch["scores"], merged_proto.non_tensor_batch["scores"]) - - print("\n✓ Original and merged DataProto are identical!") - - -def example_item_processing(): - """Example showing individual item processing before merging.""" - print("\n=== Individual Item Processing Example ===") - - # Create initial data - # batch_size = 4 - - values = torch.tensor([1.0, 2.0, 3.0, 4.0]).unsqueeze(1) # Shape: (4, 1) - labels = np.array(["A", "B", "C", "D"], dtype=object) - - original_proto = DataProto.from_dict( - tensors={"values": values}, non_tensors={"labels": labels}, meta_info={"processing_step": 0} - ) - - print(f"Original values: {original_proto.batch['values'].flatten()}") - print(f"Original labels: {original_proto.non_tensor_batch['labels']}") - - # Split and process each item individually - items = original_proto.to_items() - processed_items = [] - - for i, item in enumerate(items): - # Process the tensor data (multiply by 2) - processed_value = item.batch["values"] * 2 - - # Process the non-tensor data (add suffix) - processed_label = item.non_tensor_batch["labels"] + f"_processed_{i}" - - # Create new processed item - processed_item = DataProtoItem( - batch=item.batch.clone(), # Clone the TensorDict - non_tensor_batch=item.non_tensor_batch.copy(), - meta_info=item.meta_info.copy(), - ) - - # Update with processed data - processed_item.batch["values"] = processed_value - processed_item.non_tensor_batch["labels"] = processed_label - processed_item.meta_info["processing_step"] = 1 - - processed_items.append(processed_item) - - print(f"Processed item {i}: value={processed_value.item()}, label='{processed_label}'") - - # Merge processed items back - processed_proto = DataProto.from_items(processed_items) - - print(f"\nProcessed values: {processed_proto.batch['values'].flatten()}") - 
print(f"Processed labels: {processed_proto.non_tensor_batch['labels']}") - print(f"Processing step: {processed_proto.meta_info['processing_step']}") - - -def example_convenience_methods(): - """Example showing convenience methods.""" - print("\n=== Convenience Methods Example ===") - - # Create a single DataProtoItem - single_tensor = torch.tensor([42]).unsqueeze(0) # Shape: (1,) - single_item = DataProtoItem( - batch=None, # We'll create TensorDict manually - non_tensor_batch={"text": "Hello"}, - meta_info={"source": "manual"}, - ) - - # Create TensorDict manually for the single item - from tensordict import TensorDict - - single_item.batch = TensorDict({"data": single_tensor}, batch_size=(1,)) - - print(f"Single item data: {single_item.batch['data']}") - print(f"Single item text: {single_item.non_tensor_batch['text']}") - - # Convert single item to DataProto using convenience method - single_proto = single_item.to_proto() - print(f"Converted to DataProto length: {len(single_proto)}") - - # Create multiple items and use static convenience method - items = [single_item] - for i in range(2): - new_item = single_item.copy() # Use the copy method - new_item.batch["data"] = torch.tensor([100 + i]).unsqueeze(0) - new_item.non_tensor_batch["text"] = f"Item {i + 1}" - items.append(new_item) - - # Use DataProtoItem.from_items() convenience method - merged_proto = DataProtoItem.from_items(items) - print(f"Merged using convenience method - length: {len(merged_proto)}") - print(f"Data: {merged_proto.batch['data'].flatten()}") - print(f"Texts: {merged_proto.non_tensor_batch['text']}") - - -def example_error_handling(): - """Example showing error handling.""" - print("\n=== Error Handling Example ===") - - # Try to create DataProto from empty list - try: - DataProto.from_items([]) - print("ERROR: Should have raised exception for empty list") - except ValueError as e: - print(f"✓ Correctly caught error for empty list: {e}") - - # Try to merge items with inconsistent structure - 
try: - item1 = DataProtoItem( - batch=TensorDict({"data": torch.tensor([1]).unsqueeze(0)}, batch_size=(1,)), - non_tensor_batch={"text": "Hello"}, - ) - item2 = DataProtoItem( - batch=TensorDict({"different_key": torch.tensor([2]).unsqueeze(0)}, batch_size=(1,)), - non_tensor_batch={"text": "World"}, - ) - - DataProto.from_items([item1, item2]) - print("ERROR: Should have raised exception for inconsistent structure") - except ValueError as e: - print(f"✓ Correctly caught error for inconsistent structure: {e}") - - -if __name__ == "__main__": - # Run all tests - test_basic_split_and_merge() - test_individual_item_access() - test_partial_merge() - test_item_processing() - test_error_conditions() - test_roundtrip_integrity() - example_basic_split_merge() - example_item_processing() - example_convenience_methods() - example_error_handling() - run_visual_comparison() diff --git a/verl/protocol.py b/verl/protocol.py index 17b3b10c1f6..a4d394af97d 100644 --- a/verl/protocol.py +++ b/verl/protocol.py @@ -38,7 +38,7 @@ from verl.utils.py_functional import union_two_dict from verl.utils.torch_functional import allgather_dict_tensors -__all__ = ["DataProto", "DataProtoItem", "union_tensor_dict"] +__all__ = ["DataProto", "union_tensor_dict"] with contextlib.suppress(Exception): tensordict.set_lazy_legacy(False).set() @@ -198,83 +198,11 @@ def collate_fn(x: list["DataProtoItem"]): @dataclass class DataProtoItem: - """ - A single item from a DataProto batch, representing one sample. - This is typically used when accessing individual elements from a DataProto. 
- """ - + # TODO(zhangchi.usc1992) add consistency check batch: TensorDict = None non_tensor_batch: dict = field(default_factory=dict) meta_info: dict = field(default_factory=dict) - def __post_init__(self): - """Perform consistency checking after initialization.""" - self._check_consistency() - - def _check_consistency(self): - """Check the consistency of the DataProtoItem.""" - # For DataProtoItem, batch can have no batch dimension (batch_size=[]) or batch size 1 - if self.batch is not None: - # Allow both cases: tensors without batch dim (batch_size=[]) and tensors with batch size 1 - if hasattr(self.batch, "batch_size") and len(self.batch.batch_size) > 0: - if self.batch.batch_size[0] > 1: - raise ValueError( - f"DataProtoItem batch should have batch size 0 or 1, got {self.batch.batch_size[0]}" - ) - - # Check non_tensor_batch consistency - if self.non_tensor_batch: - for key, val in self.non_tensor_batch.items(): - # For DataProtoItem, non_tensor values should be individual items, not arrays - if isinstance(val, np.ndarray) and val.shape != (): - # Allow only scalar numpy arrays (shape=()) for individual items - if val.shape[0] > 1: - raise ValueError( - f"DataProtoItem non_tensor_batch['{key}']" - "should be a single item, got array with shape {val.shape}" - ) - - def to_proto(self) -> "DataProto": - """Convert this DataProtoItem to a DataProto with batch size 1. - - Returns: - DataProto: A DataProto containing this single item - """ - return DataProto.from_items([self]) - - @staticmethod - def from_items(items: list["DataProtoItem"]) -> "DataProto": - """Create a DataProto from a list of DataProtoItem objects. - - This is a convenience method that calls DataProto.from_items(). 
- - Args: - items (List[DataProtoItem]): A list of DataProtoItem objects to merge - - Returns: - DataProto: A new DataProto containing all the items as a batch - """ - return DataProto.from_items(items) - - def copy(self) -> "DataProtoItem": - """Create a deep copy of this DataProtoItem. - - Returns: - DataProtoItem: A deep copy of this item - """ - import copy - - # Deep copy the batch TensorDict - batch_copy = copy.deepcopy(self.batch) if self.batch is not None else None - - # Deep copy non_tensor_batch - non_tensor_copy = copy.deepcopy(self.non_tensor_batch) - - # Deep copy meta_info - meta_info_copy = copy.deepcopy(self.meta_info) - - return DataProtoItem(batch=batch_copy, non_tensor_batch=non_tensor_copy, meta_info=meta_info_copy) - @dataclass class DataProto: @@ -810,96 +738,6 @@ def split(self, split_size: int) -> list["DataProto"]: """ return [self[i : i + split_size] for i in range(0, len(self), split_size)] - def to_items(self) -> list["DataProtoItem"]: - """Convert DataProto to a list of DataProtoItem objects. - - Returns: - List[DataProtoItem]: A list containing individual DataProtoItem objects, - one for each sample in the batch - """ - items = [] - for i in range(len(self)): - # Use the existing __getitem__ implementation for single integer access - items.append(self[i]) - return items - - @staticmethod - def from_items(items: list["DataProtoItem"]) -> "DataProto": - """Create a DataProto from a list of DataProtoItem objects. 
- - Args: - items (List[DataProtoItem]): A list of DataProtoItem objects to merge - - Returns: - DataProto: A new DataProto containing all the items as a batch - - Raises: - ValueError: If the input list is empty or items have inconsistent structure - """ - if not items: - raise ValueError("Cannot create DataProto from empty list of items") - - # Get the first item to determine structure and meta_info - first_item = items[0] - meta_info = first_item.meta_info - - # Collect all tensor batches - batch_tensors = {} - non_tensor_batches = {} - - # Process tensor data - if first_item.batch is not None: - # Get all keys from the first item's batch - tensor_keys = list(first_item.batch.keys()) - - for key in tensor_keys: - tensor_list = [] - for i, item in enumerate(items): - if item.batch is None or key not in item.batch: - raise ValueError(f"Item {i} missing tensor key '{key}' in batch") - - tensor = item.batch[key] - # Handle tensors from DataProtoItem which may not have batch dimension - # (as shown in the user's example where batch_size=torch.Size([])) - if tensor.dim() == 0: - # Scalar tensor - add batch dimension - tensor = tensor.unsqueeze(0) - else: - # Multi-dimensional tensor without batch dimension - add batch dimension - tensor = tensor.unsqueeze(0) - - tensor_list.append(tensor) - - # Concatenate tensors along batch dimension - if tensor_list: - batch_tensors[key] = torch.cat(tensor_list, dim=0) - - # Process non-tensor data - if first_item.non_tensor_batch: - non_tensor_keys = list(first_item.non_tensor_batch.keys()) - - for key in non_tensor_keys: - non_tensor_list = [] - for i, item in enumerate(items): - if key not in item.non_tensor_batch: - raise ValueError(f"Item {i} missing non_tensor key '{key}'") - - non_tensor_data = item.non_tensor_batch[key] - non_tensor_list.append(non_tensor_data) - - # Stack non-tensor data - if non_tensor_list: - non_tensor_batches[key] = np.array(non_tensor_list, dtype=object) - - # Create TensorDict for batch - if 
batch_tensors: - batch_size = len(items) - batch = TensorDict(source=batch_tensors, batch_size=(batch_size,)) - else: - batch = None - - return DataProto(batch=batch, non_tensor_batch=non_tensor_batches, meta_info=meta_info) - @staticmethod def concat(data: list["DataProto"]) -> "DataProto": """Concat a list of DataProto. The batch is concatenated among dim=0. From 174e762a05963a13fc16c9ca7d31928539df064f Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Tue, 9 Sep 2025 20:53:08 +0800 Subject: [PATCH 117/182] move shell --- .../fully_async_policy/agent_loop/__init__.py | 22 + .../agent_loop/agent_loop.py | 637 ++++++++++++++++++ .../partial_single_turn_agent_loop.py | 74 ++ .../{ => shell}/dapo_7b_math_fsdp2_2_6.sh | 2 +- .../{ => shell}/dapo_7b_math_fsdp2_4_12.sh | 0 .../{ => shell}/dapo_7b_math_fsdp2_8_8.sh | 0 .../dapo_7b_math_fsdp2_colocate.sh | 0 .../{ => shell}/dapo_7b_math_fsdp2_server.sh | 0 .../dapo_7b_math_megatron_colocate.sh | 0 .../{ => shell}/runtime_env.yaml | 0 10 files changed, 734 insertions(+), 1 deletion(-) create mode 100644 recipe/fully_async_policy/agent_loop/__init__.py create mode 100644 recipe/fully_async_policy/agent_loop/agent_loop.py create mode 100644 recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py rename recipe/fully_async_policy/{ => shell}/dapo_7b_math_fsdp2_2_6.sh (99%) rename recipe/fully_async_policy/{ => shell}/dapo_7b_math_fsdp2_4_12.sh (100%) rename recipe/fully_async_policy/{ => shell}/dapo_7b_math_fsdp2_8_8.sh (100%) rename recipe/fully_async_policy/{ => shell}/dapo_7b_math_fsdp2_colocate.sh (100%) rename recipe/fully_async_policy/{ => shell}/dapo_7b_math_fsdp2_server.sh (100%) rename recipe/fully_async_policy/{ => shell}/dapo_7b_math_megatron_colocate.sh (100%) rename recipe/fully_async_policy/{ => shell}/runtime_env.yaml (100%) diff --git a/recipe/fully_async_policy/agent_loop/__init__.py b/recipe/fully_async_policy/agent_loop/__init__.py new file mode 100644 index 00000000000..284f3e975c0 --- /dev/null 
+++ b/recipe/fully_async_policy/agent_loop/__init__.py @@ -0,0 +1,22 @@ +# Copyright 2025 Meituan Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .agent_loop import AgentLoopBase, AgentLoopManager +from .single_turn_agent_loop import SingleTurnAgentLoop +from .partial_single_turn_agent_loop import PartialSingleTurnAgentLoop +from .tool_agent_loop import ToolAgentLoop + +_ = [SingleTurnAgentLoop, ToolAgentLoop, PartialSingleTurnAgentLoop] + +__all__ = ["AgentLoopBase", "AgentLoopManager"] diff --git a/recipe/fully_async_policy/agent_loop/agent_loop.py b/recipe/fully_async_policy/agent_loop/agent_loop.py new file mode 100644 index 00000000000..32d52df8804 --- /dev/null +++ b/recipe/fully_async_policy/agent_loop/agent_loop.py @@ -0,0 +1,637 @@ +# Copyright 2025 Meituan Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import asyncio +import heapq +import logging +import os +import random +from abc import ABC, abstractmethod +from typing import Any, Optional + +import hydra +import numpy as np +import ray +import torch +from cachetools import LRUCache +from omegaconf import DictConfig, OmegaConf +from pydantic import BaseModel +from tensordict import TensorDict +from transformers import AutoTokenizer + +from verl.protocol import DataProto +from verl.single_controller.ray.base import RayWorkerGroup +from verl.utils import hf_tokenizer +from verl.utils.fs import copy_to_local +from verl.utils.rollout_trace import RolloutTraceConfig, rollout_trace_attr, rollout_trace_op +from verl.workers.rollout.async_server import async_server_class + +logger = logging.getLogger(__file__) +logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) + + +class AsyncLLMServerManager: + """ + A class to manage multiple OpenAI compatible LLM servers. This class provides + - Load balance: least requests load balancing + - Sticky session: send multi-turn chat completions to same server for automatic prefix caching + """ + + def __init__(self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], max_cache_size: int = 10000): + """Initialize the AsyncLLMServerManager. + + Args: + config (DictConfig): YAML config. + server_handles (List[ray.actor.ActorHandle]): OpenAI compatible LLM server actor handles. + max_cache_size (int, optional): max cache size for request_id to server mapping. Defaults to 10000. 
+ """ + self.config = config + self.server_handles = server_handles + random.shuffle(self.server_handles) + + # Least requests load balancing + self.weighted_serveres = [[0, (hash(server), server)] for server in server_handles] + heapq.heapify(self.weighted_serveres) + + # LRU cache to map request_id to server + self.request_id_to_server = LRUCache(maxsize=max_cache_size) + + def _choose_server(self, request_id: str) -> ray.actor.ActorHandle: + # TODO: implement server pressure awareness load balancing + if request_id in self.request_id_to_server: + return self.request_id_to_server[request_id] + + server = self.weighted_serveres[0][1][1] + self.weighted_serveres[0][0] += 1 + heapq.heapreplace(self.weighted_serveres, self.weighted_serveres[0]) + self.request_id_to_server[request_id] = server + return server + + @rollout_trace_op + async def generate( + self, + request_id, + *, + prompt_ids: list[int], + sampling_params: dict[str, Any], + ) -> list[int]: + """Generate tokens from prompt ids. + + Args: + request_id (str): request id for sticky session. + prompt_ids (List[int]): List of prompt token ids. + sampling_params (Dict[str, Any]): Sampling parameters for the chat completion. + + Returns: + List[int]: List of generated token ids. + """ + server = self._choose_server(request_id) + output = await server.generate.remote( + request_id=request_id, + prompt_ids=prompt_ids, + sampling_params=sampling_params, + ) + return output + + async def generate_for_partial(self, request_id, prompt_ids, sampling_params): + """Generate tokens from prompt ids. 
with partial rollout function""" + server = self._choose_server(request_id) + output = await server.generate_for_partial.remote( + request_id=request_id, + prompt_ids=prompt_ids, + sampling_params=sampling_params, + ) + return output + + +class AgentLoopMetrics(BaseModel): + """Agent loop performance metrics.""" + + generate_sequences: float = 0.0 + tool_calls: float = 0.0 + + +class AgentLoopOutput(BaseModel): + """Agent loop output.""" + + prompt_ids: list[int] + """Prompt token ids.""" + response_ids: list[int] + """Response token ids including LLM generated token, tool response token.""" + response_mask: list[int] + """Response mask, 1 for LLM generated token, 0 for tool response token.""" + num_turns: int = 0 + """Number of chat turns, including user, assistant, tool.""" + metrics: AgentLoopMetrics + """Auxiliary performance metrics""" + is_cancel: bool = False + """Indicates whether the request was interrupted""" + log_probs: list[float] = None + """Response token log probs including LLM generated token, tool response token.""" + + +# make hydra.utils.instantiate happy +class _DummyConfig: + def __init__(self, config: DictConfig) -> None: + self.config = config + + +class AgentLoopBase(ABC): + """An agent loop takes a input message, chat with OpenAI compatible LLM server and interact with various + environments.""" + + _class_initialized = False + + def __init__( + self, trainer_config: _DummyConfig, server_manager: AsyncLLMServerManager, tokenizer: AutoTokenizer, **kwargs + ): + """Initialize agent loop, each sample will have its own loop instance. + + Args: + trainer_config (_DummyConfig): trainer config. + server_manager (AsyncLLMServerManager): OpenAI compatible LLM server manager. + tokenizer (AutoTokenizer): Tokenizer for tokenize messages. 
+ """ + self.init_class(trainer_config.config, tokenizer, **kwargs) + self.config = trainer_config.config + self.server_manager = server_manager + self.tokenizer = tokenizer + self.loop = asyncio.get_running_loop() + + @classmethod + def init_class(cls, config: DictConfig, tokenizer: AutoTokenizer, **kwargs): + """This is used to do heavy initialization work that should shared across all instances. It's only called once. + + Args: + config (DictConfig): trainer config. + tokenizer (AutoTokenizer): Tokenizer for tokenize messages. + **kwargs: extra kwargs from config file passed in by `hydra.utils.instantiate`. + """ + if cls._class_initialized: + return + cls._class_initialized = True + + @abstractmethod + async def run( + self, messages: list[dict[str, Any]], sampling_params: dict[str, Any], partial_output: Optional[AgentLoopOutput] + ) -> AgentLoopOutput: + """Run agent loop to interact with LLM server and environment. + + Args: + messages (List[Dict[str, Any]]): Input messages. + sampling_params (Dict[str, Any]): LLM sampling params. + partial_output: Optional[AgentLoopOutput]: already rollout result. + + Returns: + AgentLoopOutput: Agent loop output. 
+ """ + raise NotImplementedError + + +def postprocess_agent_loop_outputs(inputs: list[AgentLoopOutput], tokenizer, config) -> DataProto: + """Static method to postprocess a list of AgentLoopOutput into DataProto + + Args: + inputs: List of AgentLoopOutput + tokenizer: Tokenizer instance + config: Configuration object + + Returns: + DataProto: Processed batch data + """ + # NOTE: consistent with batch version of generate_sequences in vllm_rollout_spmd.py + # prompts: left pad + # responses: right pad + # input_ids: prompt + response + # attention_mask: [0,0,0,0,1,1,1,1, | 1,1,1,0,0,0,0,0] + # position_ids: [0,0,0,0,0,1,2,3, | 4,5,6,7,8,9,10,11] + + # prompts + tokenizer.padding_side = "left" + outputs = tokenizer.pad( + [{"input_ids": input.prompt_ids} for input in inputs], + padding="max_length", + max_length=config.actor_rollout_ref.rollout.prompt_length, + return_tensors="pt", + return_attention_mask=True, + ) + prompt_ids, prompt_attention_mask = outputs["input_ids"], outputs["attention_mask"] + + # responses + tokenizer.padding_side = "right" + outputs = tokenizer.pad( + [{"input_ids": input.response_ids} for input in inputs], + padding="max_length", + max_length=config.actor_rollout_ref.rollout.response_length, + return_tensors="pt", + return_attention_mask=True, + ) + response_ids, response_attention_mask = outputs["input_ids"], outputs["attention_mask"] + + # response_mask + outputs = tokenizer.pad( + [{"input_ids": input.response_mask} for input in inputs], + padding="max_length", + max_length=config.actor_rollout_ref.rollout.response_length, + return_tensors="pt", + return_attention_mask=False, + ) + response_mask = outputs["input_ids"] + assert response_ids.shape == response_mask.shape, ( + f"mismatch in response_ids and response_mask shape: {response_ids.shape} vs {response_mask.shape}" + ) + response_mask = response_mask * response_attention_mask + + input_ids = torch.cat([prompt_ids, response_ids], dim=1) + attention_mask = 
torch.cat([prompt_attention_mask, response_attention_mask], dim=1) + position_ids = (attention_mask.cumsum(dim=1) - 1) * attention_mask + + batch = TensorDict( + { + "prompts": prompt_ids, # [bsz, prompt_length] + "responses": response_ids, # [bsz, response_length] + "response_mask": response_mask, # [bsz, response_length] + "input_ids": input_ids, # [bsz, prompt_length + response_length] + "attention_mask": attention_mask, # [bsz, prompt_length + response_length] + "position_ids": position_ids, # [bsz, prompt_length + response_length] + }, + batch_size=len(input_ids), + ) + + num_turns = np.array([input.num_turns for input in inputs], dtype=np.int32) + metrics = [input.metrics.model_dump() for input in inputs] + return DataProto(batch=batch, non_tensor_batch={"__num_turns__": num_turns}, meta_info={"metrics": metrics}) + + +@ray.remote +class AgentLoopWorker: + """Agent loop worker takes a batch of messages and run each message in an agent loop.""" + + def __init__(self, config: DictConfig, server_handles: list[ray.actor.ActorHandle]): + """Initialize agent loop manager. + + Args: + config (DictConfig): YAML config. + server_handles (List[ray.actor.ActorHandle]): OpenAI compatible LLM server actor handles. 
+ """ + self.config = config + self.server_manager = AsyncLLMServerManager(config, server_handles) + + model_path = config.actor_rollout_ref.model.path + self.model_name = "/".join(model_path.split("/")[-2:]) + local_path = copy_to_local(config.actor_rollout_ref.model.path) + self.tokenizer = hf_tokenizer(local_path, trust_remote_code=True) + + agent_loop_config_path = config.actor_rollout_ref.rollout.agent.agent_loop_config_path + if agent_loop_config_path: + agent_loop_configs = OmegaConf.load(agent_loop_config_path) + for agent_loop_config in agent_loop_configs: + _agent_loop_registry[agent_loop_config.name] = agent_loop_config + + trace_config = config.trainer.get("rollout_trace", {}) + trace_config = self.config.actor_rollout_ref.rollout.get("trace", {}) + RolloutTraceConfig.init( + self.config.trainer.project_name, + self.config.trainer.experiment_name, + trace_config.get("backend"), + trace_config.get("token2text", False), + ) + + async def generate_sequences(self, batch: DataProto) -> DataProto: + """Generate sequences from agent loop. + + Args: + batch (DataProto): Input batch. + + Returns: + DataProto: Output batch. + - prompts: [bsz, prompt_length], prompt token ids from dataset. + - responses: [bsz, response_length], output token ids include response tokens + from LLM generation and observation tokens from tool_calls. + - response_mask: [bsz, response_length], 1 for LLM generated tokens, 0 for observation/padding tokens. + - input_ids: [bsz, prompt_length + response_length], whole sequence token ids, including prompt tokens + and response tokens. + - attention_mask: [bsz, prompt_length + response_length], 0 for padding tokens, 1 for other tokens. + - position_ids: [bsz, prompt_length + response_length], incremental position ids. 
+ + For multi-turn conversations: + responses: |<- LLM generation ->|<- tool_calls ->|<- LLM generation ->|<- padding ->| + response_mask: | 1, 1, 1, ..., 1, 1 | 0, 0, .., 0, 0 | 1, 1, 1, ..., 1, 1 | 0, 0, ..., 0| + """ + config = self.config.actor_rollout_ref.rollout + sampling_params = dict( + temperature=config.temperature, + top_p=config.top_p, + repetition_penalty=1.0, + ) + + # override sampling params for validation + if batch.meta_info.get("validate", False): + sampling_params["top_p"] = config.val_kwargs.top_p + sampling_params["temperature"] = config.val_kwargs.temperature + + # by default, we assume it's a single turn agent + if "agent_name" not in batch.non_tensor_batch: + batch.non_tensor_batch["agent_name"] = np.array(["single_turn_agent"] * len(batch), dtype=object) + + tasks = [] + agent_names = batch.non_tensor_batch["agent_name"] + raw_prompts = batch.non_tensor_batch["raw_prompt"] + if "index" in batch.non_tensor_batch: + index = batch.non_tensor_batch["index"] + else: + index = np.arange(len(raw_prompts)) + + trajectory_info = await get_trajectory_info( + batch.meta_info.get("global_steps", -1), index, batch.meta_info.get("validate", False) + ) + + for agent_name, messages, trajectory in zip(agent_names, raw_prompts, trajectory_info, strict=True): + tasks.append( + asyncio.create_task(self._run_agent_loop(agent_name, messages.tolist(), sampling_params, trajectory)) + ) + outputs = await asyncio.gather(*tasks) + + output = postprocess_agent_loop_outputs(outputs, self.tokenizer, self.config) + return output + + async def generate_sequences_no_post( + self, batch: DataProto, partial_output_list: Optional[list[AgentLoopOutput]] + ) -> list[AgentLoopOutput]: + """Generate sequences from agent loop. + + Args: + batch (DataProto): Input batch. + partial_output_list: Optional[List[AgentLoopOutput]]: already rollout result. + + Returns: + list[AgentLoopOutput]: List of agent loop outputs, one per sample in the batch. 
+ Each AgentLoopOutput contains: + - prompt_ids: prompt token ids + - response_ids: response token ids including LLM generated and tool response tokens + - response_mask: 1 for LLM generated tokens, 0 for tool response tokens + - num_turns: number of chat turns + - metrics: performance metrics + """ + config = self.config.actor_rollout_ref.rollout + sampling_params = dict( + temperature=config.temperature, + top_p=config.top_p, + repetition_penalty=1.0, + ) + + # override sampling params for validation + if batch.meta_info.get("validate", False): + sampling_params["top_p"] = config.val_kwargs.top_p + sampling_params["temperature"] = config.val_kwargs.temperature + + # by default, we assume it's a single turn agent + if "agent_name" not in batch.non_tensor_batch: + batch.non_tensor_batch["agent_name"] = np.array(["single_turn_agent"] * len(batch), dtype=object) + + tasks = [] + agent_names = batch.non_tensor_batch["agent_name"] + raw_prompts = batch.non_tensor_batch["raw_prompt"] + if "index" in batch.non_tensor_batch: + index = batch.non_tensor_batch["index"] + else: + index = np.arange(len(raw_prompts)) + + trajectory_info = await get_trajectory_info( + batch.meta_info.get("global_steps", -1), index, batch.meta_info.get("validate", False) + ) + if not partial_output_list: + partial_output_list = [None] * len(batch) + + for agent_name, messages, trajectory, partial_output in zip( + agent_names, raw_prompts, trajectory_info, partial_output_list, strict=True + ): + tasks.append( + asyncio.create_task( + self._run_agent_loop(agent_name, messages.tolist(), sampling_params, trajectory, partial_output) + ) + ) + outputs = await asyncio.gather(*tasks) + + return outputs + + async def _run_agent_loop( + self, + agent_name: str, + messages: list[dict[str, Any]], + sampling_params: dict[str, Any], + trajectory: dict[str, Any], + partial_output: Optional[AgentLoopOutput] = None, + ) -> AgentLoopOutput: + with rollout_trace_attr( + step=trajectory["step"], + 
sample_index=trajectory["sample_index"], + rollout_n=trajectory["rollout_n"], + validate=trajectory["validate"], + name="agent_loop", + ): + assert agent_name in _agent_loop_registry, ( + f"Agent loop {agent_name} not registered, registered agent loops: {_agent_loop_registry.keys()}" + ) + agent_loop_config = _agent_loop_registry[agent_name] + agent_loop = hydra.utils.instantiate( + config=agent_loop_config, + trainer_config=_DummyConfig(config=self.config), + server_manager=self.server_manager, + tokenizer=self.tokenizer, + ) + output = await agent_loop.run(messages, sampling_params, partial_output) + return output + + +class AgentLoopManager: + """Agent loop manager that manages a group of agent loop workers.""" + + def __init__(self, config: DictConfig, worker_group: RayWorkerGroup): + """Initialize agent loop manager. + + Args: + config (DictConfig): trainer config. + worker_group (RayWorkerGroup): AsyncActorRolloutRefWorker worker group. + """ + self.config = config + self.worker_group = worker_group + + self._initialize_llm_servers() + self._init_agent_loop_workers() + + # Initially we're in sleep mode. 
+ self.sleep() + + def _initialize_llm_servers(self): + self.rollout_tp_size = self.config.actor_rollout_ref.rollout.tensor_model_parallel_size + self.rollout_dp_size = self.worker_group.world_size // self.rollout_tp_size + + register_center = ray.get_actor(f"{self.worker_group.name_prefix}_register_center") + workers_info = ray.get(register_center.get_worker_info.remote()) + assert len(workers_info) == self.worker_group.world_size + + self.async_llm_servers = [None] * self.rollout_dp_size + self.server_addresses = [None] * self.rollout_dp_size + + if self.config.actor_rollout_ref.rollout.agent.custom_async_server: + server_class = async_server_class( + rollout_backend=self.config.actor_rollout_ref.rollout.name, + rollout_backend_module=self.config.actor_rollout_ref.rollout.agent.custom_async_server.path, + rollout_backend_class=self.config.actor_rollout_ref.rollout.agent.custom_async_server.name, + ) + else: + server_class = async_server_class(rollout_backend=self.config.actor_rollout_ref.rollout.name) + + # Start all server instances, restart if address already in use. 
+ unready_dp_ranks = set(range(self.rollout_dp_size)) + while len(unready_dp_ranks) > 0: + servers = { + rollout_dp_rank: server_class.options( + # make sure AsyncvLLMServer colocates with its corresponding workers + scheduling_strategy=ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy( + node_id=workers_info[rollout_dp_rank * self.rollout_tp_size], + soft=False, + ), + name=f"async_llm_server_{rollout_dp_rank}", + ).remote(self.config, self.rollout_dp_size, rollout_dp_rank, self.worker_group.name_prefix) + for rollout_dp_rank in unready_dp_ranks + } + + for rollout_dp_rank, server in servers.items(): + try: + address = ray.get(server.get_server_address.remote()) + self.server_addresses[rollout_dp_rank] = address + self.async_llm_servers[rollout_dp_rank] = server + unready_dp_ranks.remove(rollout_dp_rank) + except Exception: + ray.kill(server) + print(f"rollout server {rollout_dp_rank} failed, maybe address already in use, restarting...") + + # All server instances are ready, init AsyncLLM engine. + ray.get([server.init_engine.remote() for server in self.async_llm_servers]) + + def _init_agent_loop_workers(self): + self.agent_loop_workers = [] + # 获取建议的资源配置 + agent_config = self.config.actor_rollout_ref.rollout.agent + max_concurrency = agent_config.get("max_concurrency", 10) + num_cpus = agent_config.get("num_cpus", 2) # 默认2个CPU核心 + + for i in range(agent_config.num_workers): + self.agent_loop_workers.append( + AgentLoopWorker.options( + name=f"agent_loop_worker_{i}", + max_concurrency=max_concurrency, # 设置最大并发数 + num_cpus=num_cpus, # 设置CPU资源需求 + ).remote(self.config, self.async_llm_servers) + ) + + def generate_sequences(self, prompts: DataProto) -> DataProto: + """Split input batch and dispatch to agent loop workers. + + Args: + prompts (DataProto): Input batch. + + Returns: + DataProto: Output batch. 
+ """ + if self.config.actor_rollout_ref.rollout.free_cache_engine: + self.wake_up() + chunkes = prompts.chunk(len(self.agent_loop_workers)) + outputs = ray.get( + [ + worker.generate_sequences.remote(chunk) + for worker, chunk in zip(self.agent_loop_workers, chunkes, strict=True) + ] + ) + output = DataProto.concat(outputs) + if self.config.actor_rollout_ref.rollout.free_cache_engine: + self.sleep() + + # calculate performance metrics + metrics = [output.meta_info["metrics"] for output in outputs] # List[List[Dict[str, str]]] + timing = self._performance_metrics(metrics, output) + + output.meta_info = {"timing": timing} + return output + + async def generate_single_sample_async( + self, + sample: DataProto, + partial_output_list: Optional[list[AgentLoopOutput]], + ) -> list[AgentLoopOutput]: + """ + 异步处理单个样本, 需要复制n次 + + Args: + sample: 单个样本数据 + partial_output_list: Optional[List[AgentLoopOutput]]: already rollout result. + + Returns: + tuple[AgentLoopOutput, float]: 处理结果和处理时间 + """ + # 使用负载均衡选择 worker + worker = self._select_best_worker() + # 异步处理单个样本 - 使用无后处理版本获取原始AgentLoopOutput + output_future = worker.generate_sequences_no_post.remote(sample, partial_output_list) + return await asyncio.wrap_future(output_future.future()) + + def _select_best_worker(self): + """选择最佳的 worker(简单的轮询负载均衡)""" + if not hasattr(self, "_worker_index"): + self._worker_index = 0 + + worker = self.agent_loop_workers[self._worker_index] + self._worker_index = (self._worker_index + 1) % len(self.agent_loop_workers) + return worker + + def _performance_metrics(self, metrics: list[list[dict[str, str]]], output: DataProto) -> dict[str, float]: + timing = {} + t_generate_sequences = np.array([metric["generate_sequences"] for chunk in metrics for metric in chunk]) + t_tool_calls = np.array([metric["tool_calls"] for chunk in metrics for metric in chunk]) + timing["agent_loop/generate_sequences/min"] = t_generate_sequences.min() + timing["agent_loop/generate_sequences/max"] = 
t_generate_sequences.max() + timing["agent_loop/generate_sequences/mean"] = t_generate_sequences.mean() + timing["agent_loop/tool_calls/min"] = t_tool_calls.min() + timing["agent_loop/tool_calls/max"] = t_tool_calls.max() + timing["agent_loop/tool_calls/mean"] = t_tool_calls.mean() + + # batch sequence generation is bounded by the slowest sample + slowest = np.argmax(t_generate_sequences + t_tool_calls) + attention_mask = output.batch["attention_mask"][slowest] + prompt_length = output.batch["prompts"].shape[1] + timing["agent_loop/slowest/generate_sequences"] = t_generate_sequences[slowest] + timing["agent_loop/slowest/tool_calls"] = t_tool_calls[slowest] + timing["agent_loop/slowest/prompt_length"] = attention_mask[:prompt_length].sum().item() + timing["agent_loop/slowest/response_length"] = attention_mask[prompt_length:].sum().item() + + return timing + + def wake_up(self): + """Wake up all rollout server instances.""" + ray.get([server.wake_up.remote() for server in self.async_llm_servers]) + + def sleep(self): + """Sleep all rollout server instances.""" + ray.get([server.sleep.remote() for server in self.async_llm_servers]) + + async def cancel_async(self): + """Cancel all rollout tasks asynchronously.""" + futures = [server.cancel.remote() for server in self.async_llm_servers] + await asyncio.gather(*[asyncio.wrap_future(future.future()) for future in futures], return_exceptions=True) + + async def resume_async(self): + """Cancel all rollout tasks asynchronously.""" + futures = [server.resume.remote() for server in self.async_llm_servers] + await asyncio.gather(*[asyncio.wrap_future(future.future()) for future in futures], return_exceptions=True) diff --git a/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py b/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py new file mode 100644 index 00000000000..ccdb9084238 --- /dev/null +++ b/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py @@ -0,0 +1,74 @@ 
+# Copyright 2025 Meituan Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging +import os +from typing import Any, Optional +from uuid import uuid4 + +from verl.experimental.agent_loop.agent_loop import AgentLoopBase, AgentLoopOutput, register +from verl.utils.profiler import simple_timer + +logger = logging.getLogger(__file__) +logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) + + +@register("partial_single_turn_agent") +class PartialSingleTurnAgentLoop(AgentLoopBase): + """Naive agent loop that only do single turn chat completion.""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.prompt_length = self.config.actor_rollout_ref.rollout.prompt_length + self.response_length = self.config.actor_rollout_ref.rollout.response_length + + async def run( + self, messages: list[dict[str, Any]], sampling_params: dict[str, Any], output: Optional[AgentLoopOutput] + ) -> AgentLoopOutput: + if not output: + prompt_ids = await self.loop.run_in_executor( + None, lambda: self.tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True) + ) + else: + if output.is_cancel: + # 恢复暂停的样本,结果直接添加到 prompt_ids 后面 + prompt_ids = output.prompt_ids + output.response_ids + else: + # 同一批样本,部分cancel,部分没有cancel, 没有cancel的样本直接返回 + return output + + metrics = {} + request_id = uuid4().hex + with simple_timer("generate_sequences", metrics): + response_ids, log_probs, is_cancel = await 
self.server_manager.generate_for_partial( + request_id=request_id, prompt_ids=prompt_ids, sampling_params=sampling_params + ) + + if not output: + response_mask = [1] * len(response_ids) + # 暂停待恢复样本, 把输出结果加到 response_ids 后,并重置 response_mask + else: + prompt_ids = output.prompt_ids + log_probs = output.log_probs + log_probs + response_ids = output.response_ids + response_ids + response_mask = [1] * len(response_ids) + + return AgentLoopOutput( + prompt_ids=prompt_ids, + response_ids=response_ids[: self.response_length], + response_mask=response_mask[: self.response_length], + num_turns=2, + metrics=metrics, + is_cancel=is_cancel, + log_probs=log_probs, + ) diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_2_6.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_2_6.sh similarity index 99% rename from recipe/fully_async_policy/dapo_7b_math_fsdp2_2_6.sh rename to recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_2_6.sh index 5f654227d15..0d303bdde87 100644 --- a/recipe/fully_async_policy/dapo_7b_math_fsdp2_2_6.sh +++ b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_2_6.sh @@ -155,7 +155,7 @@ $PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \ trainer.logger=['console','tensorboard'] \ trainer.project_name="${project_name}" \ trainer.experiment_name="${exp_name}" \ - trainer.val_before_train=True \ + trainer.val_before_train=False \ trainer.save_freq=-1 \ trainer.default_local_dir="${CKPTS_DIR}" \ trainer.resume_mode=auto \ diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh similarity index 100% rename from recipe/fully_async_policy/dapo_7b_math_fsdp2_4_12.sh rename to recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh similarity index 100% rename from recipe/fully_async_policy/dapo_7b_math_fsdp2_8_8.sh rename to 
recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_colocate.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_colocate.sh similarity index 100% rename from recipe/fully_async_policy/dapo_7b_math_fsdp2_colocate.sh rename to recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_colocate.sh diff --git a/recipe/fully_async_policy/dapo_7b_math_fsdp2_server.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_server.sh similarity index 100% rename from recipe/fully_async_policy/dapo_7b_math_fsdp2_server.sh rename to recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_server.sh diff --git a/recipe/fully_async_policy/dapo_7b_math_megatron_colocate.sh b/recipe/fully_async_policy/shell/dapo_7b_math_megatron_colocate.sh similarity index 100% rename from recipe/fully_async_policy/dapo_7b_math_megatron_colocate.sh rename to recipe/fully_async_policy/shell/dapo_7b_math_megatron_colocate.sh diff --git a/recipe/fully_async_policy/runtime_env.yaml b/recipe/fully_async_policy/shell/runtime_env.yaml similarity index 100% rename from recipe/fully_async_policy/runtime_env.yaml rename to recipe/fully_async_policy/shell/runtime_env.yaml From 085f36795661f7d53fc0085a31fd85771e462dc6 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Tue, 9 Sep 2025 21:32:27 +0800 Subject: [PATCH 118/182] rm agent_loop --- .../fully_async_policy/agent_loop/__init__.py | 22 - .../agent_loop/agent_loop.py | 637 ------------------ .../partial_single_turn_agent_loop.py | 74 -- 3 files changed, 733 deletions(-) delete mode 100644 recipe/fully_async_policy/agent_loop/__init__.py delete mode 100644 recipe/fully_async_policy/agent_loop/agent_loop.py delete mode 100644 recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py diff --git a/recipe/fully_async_policy/agent_loop/__init__.py b/recipe/fully_async_policy/agent_loop/__init__.py deleted file mode 100644 index 284f3e975c0..00000000000 --- 
a/recipe/fully_async_policy/agent_loop/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright 2025 Meituan Ltd. and/or its affiliates -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from .agent_loop import AgentLoopBase, AgentLoopManager -from .single_turn_agent_loop import SingleTurnAgentLoop -from .partial_single_turn_agent_loop import PartialSingleTurnAgentLoop -from .tool_agent_loop import ToolAgentLoop - -_ = [SingleTurnAgentLoop, ToolAgentLoop, PartialSingleTurnAgentLoop] - -__all__ = ["AgentLoopBase", "AgentLoopManager"] diff --git a/recipe/fully_async_policy/agent_loop/agent_loop.py b/recipe/fully_async_policy/agent_loop/agent_loop.py deleted file mode 100644 index 32d52df8804..00000000000 --- a/recipe/fully_async_policy/agent_loop/agent_loop.py +++ /dev/null @@ -1,637 +0,0 @@ -# Copyright 2025 Meituan Ltd. and/or its affiliates -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import asyncio -import heapq -import logging -import os -import random -from abc import ABC, abstractmethod -from typing import Any, Optional - -import hydra -import numpy as np -import ray -import torch -from cachetools import LRUCache -from omegaconf import DictConfig, OmegaConf -from pydantic import BaseModel -from tensordict import TensorDict -from transformers import AutoTokenizer - -from verl.protocol import DataProto -from verl.single_controller.ray.base import RayWorkerGroup -from verl.utils import hf_tokenizer -from verl.utils.fs import copy_to_local -from verl.utils.rollout_trace import RolloutTraceConfig, rollout_trace_attr, rollout_trace_op -from verl.workers.rollout.async_server import async_server_class - -logger = logging.getLogger(__file__) -logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) - - -class AsyncLLMServerManager: - """ - A class to manage multiple OpenAI compatible LLM servers. This class provides - - Load balance: least requests load balancing - - Sticky session: send multi-turn chat completions to same server for automatic prefix caching - """ - - def __init__(self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], max_cache_size: int = 10000): - """Initialize the AsyncLLMServerManager. - - Args: - config (DictConfig): YAML config. - server_handles (List[ray.actor.ActorHandle]): OpenAI compatible LLM server actor handles. - max_cache_size (int, optional): max cache size for request_id to server mapping. Defaults to 10000. 
- """ - self.config = config - self.server_handles = server_handles - random.shuffle(self.server_handles) - - # Least requests load balancing - self.weighted_serveres = [[0, (hash(server), server)] for server in server_handles] - heapq.heapify(self.weighted_serveres) - - # LRU cache to map request_id to server - self.request_id_to_server = LRUCache(maxsize=max_cache_size) - - def _choose_server(self, request_id: str) -> ray.actor.ActorHandle: - # TODO: implement server pressure awareness load balancing - if request_id in self.request_id_to_server: - return self.request_id_to_server[request_id] - - server = self.weighted_serveres[0][1][1] - self.weighted_serveres[0][0] += 1 - heapq.heapreplace(self.weighted_serveres, self.weighted_serveres[0]) - self.request_id_to_server[request_id] = server - return server - - @rollout_trace_op - async def generate( - self, - request_id, - *, - prompt_ids: list[int], - sampling_params: dict[str, Any], - ) -> list[int]: - """Generate tokens from prompt ids. - - Args: - request_id (str): request id for sticky session. - prompt_ids (List[int]): List of prompt token ids. - sampling_params (Dict[str, Any]): Sampling parameters for the chat completion. - - Returns: - List[int]: List of generated token ids. - """ - server = self._choose_server(request_id) - output = await server.generate.remote( - request_id=request_id, - prompt_ids=prompt_ids, - sampling_params=sampling_params, - ) - return output - - async def generate_for_partial(self, request_id, prompt_ids, sampling_params): - """Generate tokens from prompt ids. 
with partial rollout function""" - server = self._choose_server(request_id) - output = await server.generate_for_partial.remote( - request_id=request_id, - prompt_ids=prompt_ids, - sampling_params=sampling_params, - ) - return output - - -class AgentLoopMetrics(BaseModel): - """Agent loop performance metrics.""" - - generate_sequences: float = 0.0 - tool_calls: float = 0.0 - - -class AgentLoopOutput(BaseModel): - """Agent loop output.""" - - prompt_ids: list[int] - """Prompt token ids.""" - response_ids: list[int] - """Response token ids including LLM generated token, tool response token.""" - response_mask: list[int] - """Response mask, 1 for LLM generated token, 0 for tool response token.""" - num_turns: int = 0 - """Number of chat turns, including user, assistant, tool.""" - metrics: AgentLoopMetrics - """Auxiliary performance metrics""" - is_cancel: bool = False - """Indicates whether the request was interrupted""" - log_probs: list[float] = None - """Response token log probs including LLM generated token, tool response token.""" - - -# make hydra.utils.instantiate happy -class _DummyConfig: - def __init__(self, config: DictConfig) -> None: - self.config = config - - -class AgentLoopBase(ABC): - """An agent loop takes a input message, chat with OpenAI compatible LLM server and interact with various - environments.""" - - _class_initialized = False - - def __init__( - self, trainer_config: _DummyConfig, server_manager: AsyncLLMServerManager, tokenizer: AutoTokenizer, **kwargs - ): - """Initialize agent loop, each sample will have its own loop instance. - - Args: - trainer_config (_DummyConfig): trainer config. - server_manager (AsyncLLMServerManager): OpenAI compatible LLM server manager. - tokenizer (AutoTokenizer): Tokenizer for tokenize messages. 
- """ - self.init_class(trainer_config.config, tokenizer, **kwargs) - self.config = trainer_config.config - self.server_manager = server_manager - self.tokenizer = tokenizer - self.loop = asyncio.get_running_loop() - - @classmethod - def init_class(cls, config: DictConfig, tokenizer: AutoTokenizer, **kwargs): - """This is used to do heavy initialization work that should shared across all instances. It's only called once. - - Args: - config (DictConfig): trainer config. - tokenizer (AutoTokenizer): Tokenizer for tokenize messages. - **kwargs: extra kwargs from config file passed in by `hydra.utils.instantiate`. - """ - if cls._class_initialized: - return - cls._class_initialized = True - - @abstractmethod - async def run( - self, messages: list[dict[str, Any]], sampling_params: dict[str, Any], partial_output: Optional[AgentLoopOutput] - ) -> AgentLoopOutput: - """Run agent loop to interact with LLM server and environment. - - Args: - messages (List[Dict[str, Any]]): Input messages. - sampling_params (Dict[str, Any]): LLM sampling params. - partial_output: Optional[AgentLoopOutput]: already rollout result. - - Returns: - AgentLoopOutput: Agent loop output. 
- """ - raise NotImplementedError - - -def postprocess_agent_loop_outputs(inputs: list[AgentLoopOutput], tokenizer, config) -> DataProto: - """Static method to postprocess a list of AgentLoopOutput into DataProto - - Args: - inputs: List of AgentLoopOutput - tokenizer: Tokenizer instance - config: Configuration object - - Returns: - DataProto: Processed batch data - """ - # NOTE: consistent with batch version of generate_sequences in vllm_rollout_spmd.py - # prompts: left pad - # responses: right pad - # input_ids: prompt + response - # attention_mask: [0,0,0,0,1,1,1,1, | 1,1,1,0,0,0,0,0] - # position_ids: [0,0,0,0,0,1,2,3, | 4,5,6,7,8,9,10,11] - - # prompts - tokenizer.padding_side = "left" - outputs = tokenizer.pad( - [{"input_ids": input.prompt_ids} for input in inputs], - padding="max_length", - max_length=config.actor_rollout_ref.rollout.prompt_length, - return_tensors="pt", - return_attention_mask=True, - ) - prompt_ids, prompt_attention_mask = outputs["input_ids"], outputs["attention_mask"] - - # responses - tokenizer.padding_side = "right" - outputs = tokenizer.pad( - [{"input_ids": input.response_ids} for input in inputs], - padding="max_length", - max_length=config.actor_rollout_ref.rollout.response_length, - return_tensors="pt", - return_attention_mask=True, - ) - response_ids, response_attention_mask = outputs["input_ids"], outputs["attention_mask"] - - # response_mask - outputs = tokenizer.pad( - [{"input_ids": input.response_mask} for input in inputs], - padding="max_length", - max_length=config.actor_rollout_ref.rollout.response_length, - return_tensors="pt", - return_attention_mask=False, - ) - response_mask = outputs["input_ids"] - assert response_ids.shape == response_mask.shape, ( - f"mismatch in response_ids and response_mask shape: {response_ids.shape} vs {response_mask.shape}" - ) - response_mask = response_mask * response_attention_mask - - input_ids = torch.cat([prompt_ids, response_ids], dim=1) - attention_mask = 
torch.cat([prompt_attention_mask, response_attention_mask], dim=1) - position_ids = (attention_mask.cumsum(dim=1) - 1) * attention_mask - - batch = TensorDict( - { - "prompts": prompt_ids, # [bsz, prompt_length] - "responses": response_ids, # [bsz, response_length] - "response_mask": response_mask, # [bsz, response_length] - "input_ids": input_ids, # [bsz, prompt_length + response_length] - "attention_mask": attention_mask, # [bsz, prompt_length + response_length] - "position_ids": position_ids, # [bsz, prompt_length + response_length] - }, - batch_size=len(input_ids), - ) - - num_turns = np.array([input.num_turns for input in inputs], dtype=np.int32) - metrics = [input.metrics.model_dump() for input in inputs] - return DataProto(batch=batch, non_tensor_batch={"__num_turns__": num_turns}, meta_info={"metrics": metrics}) - - -@ray.remote -class AgentLoopWorker: - """Agent loop worker takes a batch of messages and run each message in an agent loop.""" - - def __init__(self, config: DictConfig, server_handles: list[ray.actor.ActorHandle]): - """Initialize agent loop manager. - - Args: - config (DictConfig): YAML config. - server_handles (List[ray.actor.ActorHandle]): OpenAI compatible LLM server actor handles. 
- """ - self.config = config - self.server_manager = AsyncLLMServerManager(config, server_handles) - - model_path = config.actor_rollout_ref.model.path - self.model_name = "/".join(model_path.split("/")[-2:]) - local_path = copy_to_local(config.actor_rollout_ref.model.path) - self.tokenizer = hf_tokenizer(local_path, trust_remote_code=True) - - agent_loop_config_path = config.actor_rollout_ref.rollout.agent.agent_loop_config_path - if agent_loop_config_path: - agent_loop_configs = OmegaConf.load(agent_loop_config_path) - for agent_loop_config in agent_loop_configs: - _agent_loop_registry[agent_loop_config.name] = agent_loop_config - - trace_config = config.trainer.get("rollout_trace", {}) - trace_config = self.config.actor_rollout_ref.rollout.get("trace", {}) - RolloutTraceConfig.init( - self.config.trainer.project_name, - self.config.trainer.experiment_name, - trace_config.get("backend"), - trace_config.get("token2text", False), - ) - - async def generate_sequences(self, batch: DataProto) -> DataProto: - """Generate sequences from agent loop. - - Args: - batch (DataProto): Input batch. - - Returns: - DataProto: Output batch. - - prompts: [bsz, prompt_length], prompt token ids from dataset. - - responses: [bsz, response_length], output token ids include response tokens - from LLM generation and observation tokens from tool_calls. - - response_mask: [bsz, response_length], 1 for LLM generated tokens, 0 for observation/padding tokens. - - input_ids: [bsz, prompt_length + response_length], whole sequence token ids, including prompt tokens - and response tokens. - - attention_mask: [bsz, prompt_length + response_length], 0 for padding tokens, 1 for other tokens. - - position_ids: [bsz, prompt_length + response_length], incremental position ids. 
- - For multi-turn conversations: - responses: |<- LLM generation ->|<- tool_calls ->|<- LLM generation ->|<- padding ->| - response_mask: | 1, 1, 1, ..., 1, 1 | 0, 0, .., 0, 0 | 1, 1, 1, ..., 1, 1 | 0, 0, ..., 0| - """ - config = self.config.actor_rollout_ref.rollout - sampling_params = dict( - temperature=config.temperature, - top_p=config.top_p, - repetition_penalty=1.0, - ) - - # override sampling params for validation - if batch.meta_info.get("validate", False): - sampling_params["top_p"] = config.val_kwargs.top_p - sampling_params["temperature"] = config.val_kwargs.temperature - - # by default, we assume it's a single turn agent - if "agent_name" not in batch.non_tensor_batch: - batch.non_tensor_batch["agent_name"] = np.array(["single_turn_agent"] * len(batch), dtype=object) - - tasks = [] - agent_names = batch.non_tensor_batch["agent_name"] - raw_prompts = batch.non_tensor_batch["raw_prompt"] - if "index" in batch.non_tensor_batch: - index = batch.non_tensor_batch["index"] - else: - index = np.arange(len(raw_prompts)) - - trajectory_info = await get_trajectory_info( - batch.meta_info.get("global_steps", -1), index, batch.meta_info.get("validate", False) - ) - - for agent_name, messages, trajectory in zip(agent_names, raw_prompts, trajectory_info, strict=True): - tasks.append( - asyncio.create_task(self._run_agent_loop(agent_name, messages.tolist(), sampling_params, trajectory)) - ) - outputs = await asyncio.gather(*tasks) - - output = postprocess_agent_loop_outputs(outputs, self.tokenizer, self.config) - return output - - async def generate_sequences_no_post( - self, batch: DataProto, partial_output_list: Optional[list[AgentLoopOutput]] - ) -> list[AgentLoopOutput]: - """Generate sequences from agent loop. - - Args: - batch (DataProto): Input batch. - partial_output_list: Optional[List[AgentLoopOutput]]: already rollout result. - - Returns: - list[AgentLoopOutput]: List of agent loop outputs, one per sample in the batch. 
- Each AgentLoopOutput contains: - - prompt_ids: prompt token ids - - response_ids: response token ids including LLM generated and tool response tokens - - response_mask: 1 for LLM generated tokens, 0 for tool response tokens - - num_turns: number of chat turns - - metrics: performance metrics - """ - config = self.config.actor_rollout_ref.rollout - sampling_params = dict( - temperature=config.temperature, - top_p=config.top_p, - repetition_penalty=1.0, - ) - - # override sampling params for validation - if batch.meta_info.get("validate", False): - sampling_params["top_p"] = config.val_kwargs.top_p - sampling_params["temperature"] = config.val_kwargs.temperature - - # by default, we assume it's a single turn agent - if "agent_name" not in batch.non_tensor_batch: - batch.non_tensor_batch["agent_name"] = np.array(["single_turn_agent"] * len(batch), dtype=object) - - tasks = [] - agent_names = batch.non_tensor_batch["agent_name"] - raw_prompts = batch.non_tensor_batch["raw_prompt"] - if "index" in batch.non_tensor_batch: - index = batch.non_tensor_batch["index"] - else: - index = np.arange(len(raw_prompts)) - - trajectory_info = await get_trajectory_info( - batch.meta_info.get("global_steps", -1), index, batch.meta_info.get("validate", False) - ) - if not partial_output_list: - partial_output_list = [None] * len(batch) - - for agent_name, messages, trajectory, partial_output in zip( - agent_names, raw_prompts, trajectory_info, partial_output_list, strict=True - ): - tasks.append( - asyncio.create_task( - self._run_agent_loop(agent_name, messages.tolist(), sampling_params, trajectory, partial_output) - ) - ) - outputs = await asyncio.gather(*tasks) - - return outputs - - async def _run_agent_loop( - self, - agent_name: str, - messages: list[dict[str, Any]], - sampling_params: dict[str, Any], - trajectory: dict[str, Any], - partial_output: Optional[AgentLoopOutput] = None, - ) -> AgentLoopOutput: - with rollout_trace_attr( - step=trajectory["step"], - 
sample_index=trajectory["sample_index"], - rollout_n=trajectory["rollout_n"], - validate=trajectory["validate"], - name="agent_loop", - ): - assert agent_name in _agent_loop_registry, ( - f"Agent loop {agent_name} not registered, registered agent loops: {_agent_loop_registry.keys()}" - ) - agent_loop_config = _agent_loop_registry[agent_name] - agent_loop = hydra.utils.instantiate( - config=agent_loop_config, - trainer_config=_DummyConfig(config=self.config), - server_manager=self.server_manager, - tokenizer=self.tokenizer, - ) - output = await agent_loop.run(messages, sampling_params, partial_output) - return output - - -class AgentLoopManager: - """Agent loop manager that manages a group of agent loop workers.""" - - def __init__(self, config: DictConfig, worker_group: RayWorkerGroup): - """Initialize agent loop manager. - - Args: - config (DictConfig): trainer config. - worker_group (RayWorkerGroup): AsyncActorRolloutRefWorker worker group. - """ - self.config = config - self.worker_group = worker_group - - self._initialize_llm_servers() - self._init_agent_loop_workers() - - # Initially we're in sleep mode. 
- self.sleep() - - def _initialize_llm_servers(self): - self.rollout_tp_size = self.config.actor_rollout_ref.rollout.tensor_model_parallel_size - self.rollout_dp_size = self.worker_group.world_size // self.rollout_tp_size - - register_center = ray.get_actor(f"{self.worker_group.name_prefix}_register_center") - workers_info = ray.get(register_center.get_worker_info.remote()) - assert len(workers_info) == self.worker_group.world_size - - self.async_llm_servers = [None] * self.rollout_dp_size - self.server_addresses = [None] * self.rollout_dp_size - - if self.config.actor_rollout_ref.rollout.agent.custom_async_server: - server_class = async_server_class( - rollout_backend=self.config.actor_rollout_ref.rollout.name, - rollout_backend_module=self.config.actor_rollout_ref.rollout.agent.custom_async_server.path, - rollout_backend_class=self.config.actor_rollout_ref.rollout.agent.custom_async_server.name, - ) - else: - server_class = async_server_class(rollout_backend=self.config.actor_rollout_ref.rollout.name) - - # Start all server instances, restart if address already in use. 
- unready_dp_ranks = set(range(self.rollout_dp_size)) - while len(unready_dp_ranks) > 0: - servers = { - rollout_dp_rank: server_class.options( - # make sure AsyncvLLMServer colocates with its corresponding workers - scheduling_strategy=ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy( - node_id=workers_info[rollout_dp_rank * self.rollout_tp_size], - soft=False, - ), - name=f"async_llm_server_{rollout_dp_rank}", - ).remote(self.config, self.rollout_dp_size, rollout_dp_rank, self.worker_group.name_prefix) - for rollout_dp_rank in unready_dp_ranks - } - - for rollout_dp_rank, server in servers.items(): - try: - address = ray.get(server.get_server_address.remote()) - self.server_addresses[rollout_dp_rank] = address - self.async_llm_servers[rollout_dp_rank] = server - unready_dp_ranks.remove(rollout_dp_rank) - except Exception: - ray.kill(server) - print(f"rollout server {rollout_dp_rank} failed, maybe address already in use, restarting...") - - # All server instances are ready, init AsyncLLM engine. - ray.get([server.init_engine.remote() for server in self.async_llm_servers]) - - def _init_agent_loop_workers(self): - self.agent_loop_workers = [] - # 获取建议的资源配置 - agent_config = self.config.actor_rollout_ref.rollout.agent - max_concurrency = agent_config.get("max_concurrency", 10) - num_cpus = agent_config.get("num_cpus", 2) # 默认2个CPU核心 - - for i in range(agent_config.num_workers): - self.agent_loop_workers.append( - AgentLoopWorker.options( - name=f"agent_loop_worker_{i}", - max_concurrency=max_concurrency, # 设置最大并发数 - num_cpus=num_cpus, # 设置CPU资源需求 - ).remote(self.config, self.async_llm_servers) - ) - - def generate_sequences(self, prompts: DataProto) -> DataProto: - """Split input batch and dispatch to agent loop workers. - - Args: - prompts (DataProto): Input batch. - - Returns: - DataProto: Output batch. 
- """ - if self.config.actor_rollout_ref.rollout.free_cache_engine: - self.wake_up() - chunkes = prompts.chunk(len(self.agent_loop_workers)) - outputs = ray.get( - [ - worker.generate_sequences.remote(chunk) - for worker, chunk in zip(self.agent_loop_workers, chunkes, strict=True) - ] - ) - output = DataProto.concat(outputs) - if self.config.actor_rollout_ref.rollout.free_cache_engine: - self.sleep() - - # calculate performance metrics - metrics = [output.meta_info["metrics"] for output in outputs] # List[List[Dict[str, str]]] - timing = self._performance_metrics(metrics, output) - - output.meta_info = {"timing": timing} - return output - - async def generate_single_sample_async( - self, - sample: DataProto, - partial_output_list: Optional[list[AgentLoopOutput]], - ) -> list[AgentLoopOutput]: - """ - 异步处理单个样本, 需要复制n次 - - Args: - sample: 单个样本数据 - partial_output_list: Optional[List[AgentLoopOutput]]: already rollout result. - - Returns: - tuple[AgentLoopOutput, float]: 处理结果和处理时间 - """ - # 使用负载均衡选择 worker - worker = self._select_best_worker() - # 异步处理单个样本 - 使用无后处理版本获取原始AgentLoopOutput - output_future = worker.generate_sequences_no_post.remote(sample, partial_output_list) - return await asyncio.wrap_future(output_future.future()) - - def _select_best_worker(self): - """选择最佳的 worker(简单的轮询负载均衡)""" - if not hasattr(self, "_worker_index"): - self._worker_index = 0 - - worker = self.agent_loop_workers[self._worker_index] - self._worker_index = (self._worker_index + 1) % len(self.agent_loop_workers) - return worker - - def _performance_metrics(self, metrics: list[list[dict[str, str]]], output: DataProto) -> dict[str, float]: - timing = {} - t_generate_sequences = np.array([metric["generate_sequences"] for chunk in metrics for metric in chunk]) - t_tool_calls = np.array([metric["tool_calls"] for chunk in metrics for metric in chunk]) - timing["agent_loop/generate_sequences/min"] = t_generate_sequences.min() - timing["agent_loop/generate_sequences/max"] = 
t_generate_sequences.max() - timing["agent_loop/generate_sequences/mean"] = t_generate_sequences.mean() - timing["agent_loop/tool_calls/min"] = t_tool_calls.min() - timing["agent_loop/tool_calls/max"] = t_tool_calls.max() - timing["agent_loop/tool_calls/mean"] = t_tool_calls.mean() - - # batch sequence generation is bounded by the slowest sample - slowest = np.argmax(t_generate_sequences + t_tool_calls) - attention_mask = output.batch["attention_mask"][slowest] - prompt_length = output.batch["prompts"].shape[1] - timing["agent_loop/slowest/generate_sequences"] = t_generate_sequences[slowest] - timing["agent_loop/slowest/tool_calls"] = t_tool_calls[slowest] - timing["agent_loop/slowest/prompt_length"] = attention_mask[:prompt_length].sum().item() - timing["agent_loop/slowest/response_length"] = attention_mask[prompt_length:].sum().item() - - return timing - - def wake_up(self): - """Wake up all rollout server instances.""" - ray.get([server.wake_up.remote() for server in self.async_llm_servers]) - - def sleep(self): - """Sleep all rollout server instances.""" - ray.get([server.sleep.remote() for server in self.async_llm_servers]) - - async def cancel_async(self): - """Cancel all rollout tasks asynchronously.""" - futures = [server.cancel.remote() for server in self.async_llm_servers] - await asyncio.gather(*[asyncio.wrap_future(future.future()) for future in futures], return_exceptions=True) - - async def resume_async(self): - """Cancel all rollout tasks asynchronously.""" - futures = [server.resume.remote() for server in self.async_llm_servers] - await asyncio.gather(*[asyncio.wrap_future(future.future()) for future in futures], return_exceptions=True) diff --git a/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py b/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py deleted file mode 100644 index ccdb9084238..00000000000 --- a/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py +++ /dev/null @@ -1,74 +0,0 
@@ -# Copyright 2025 Meituan Ltd. and/or its affiliates -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import logging -import os -from typing import Any, Optional -from uuid import uuid4 - -from verl.experimental.agent_loop.agent_loop import AgentLoopBase, AgentLoopOutput, register -from verl.utils.profiler import simple_timer - -logger = logging.getLogger(__file__) -logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) - - -@register("partial_single_turn_agent") -class PartialSingleTurnAgentLoop(AgentLoopBase): - """Naive agent loop that only do single turn chat completion.""" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.prompt_length = self.config.actor_rollout_ref.rollout.prompt_length - self.response_length = self.config.actor_rollout_ref.rollout.response_length - - async def run( - self, messages: list[dict[str, Any]], sampling_params: dict[str, Any], output: Optional[AgentLoopOutput] - ) -> AgentLoopOutput: - if not output: - prompt_ids = await self.loop.run_in_executor( - None, lambda: self.tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True) - ) - else: - if output.is_cancel: - # 恢复暂停的样本,结果直接添加到 prompt_ids 后面 - prompt_ids = output.prompt_ids + output.response_ids - else: - # 同一批样本,部分cancel,部分没有cancel, 没有cancel的样本直接返回 - return output - - metrics = {} - request_id = uuid4().hex - with simple_timer("generate_sequences", metrics): - response_ids, log_probs, is_cancel = await 
self.server_manager.generate_for_partial( - request_id=request_id, prompt_ids=prompt_ids, sampling_params=sampling_params - ) - - if not output: - response_mask = [1] * len(response_ids) - # 暂停待恢复样本, 把输出结果加到 response_ids 后,并重置 response_mask - else: - prompt_ids = output.prompt_ids - log_probs = output.log_probs + log_probs - response_ids = output.response_ids + response_ids - response_mask = [1] * len(response_ids) - - return AgentLoopOutput( - prompt_ids=prompt_ids, - response_ids=response_ids[: self.response_length], - response_mask=response_mask[: self.response_length], - num_turns=2, - metrics=metrics, - is_cancel=is_cancel, - log_probs=log_probs, - ) From fa9e103cf36359425222d19471d73c5a71cec09c Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Tue, 9 Sep 2025 22:11:10 +0800 Subject: [PATCH 119/182] refactor agent_loop --- recipe/fully_async_policy/detach_utils.py | 147 +++++++++--------- recipe/fully_async_policy/fully_async_main.py | 5 +- .../fully_async_rollouter.py | 25 ++- .../fully_async_policy/fully_async_trainer.py | 42 ++--- recipe/fully_async_policy/message_queue.py | 2 +- recipe/fully_async_policy/param_sync.py | 7 +- verl/experimental/agent_loop/__init__.py | 2 +- verl/experimental/agent_loop/agent_loop.py | 5 +- .../agent_loop/single_turn_agent_loop.py | 6 +- .../agent_loop/tool_agent_loop.py | 6 +- verl/trainer/ppo/ray_trainer.py | 1 - 11 files changed, 124 insertions(+), 124 deletions(-) diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py index 127afca6881..18e45d50a16 100644 --- a/recipe/fully_async_policy/detach_utils.py +++ b/recipe/fully_async_policy/detach_utils.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import time -from dataclasses import dataclass -from typing import Any, Optional, Dict, List from collections import defaultdict +from dataclasses import dataclass +from typing import Any, Optional import numpy as np import torch @@ -231,157 +231,160 @@ def assemble_batch_from_rollout_samples( return final_batch + class MetricsAggregator: """Metrics aggregator, used to combine metrics from multiple training steps""" - + def __init__(self, total_gpus: int): # Store all values ​​for each metric - self.metric_values: Dict[str, List[float]] = defaultdict(list) + self.metric_values: dict[str, list[float]] = defaultdict(list) # Store the number of samples at each step for weighted averaging - self.sample_counts: List[int] = [] + self.sample_counts: list[int] = [] # Store the timestamp of each step for time-related calculations - self.timestamps: List[float] = [] + self.timestamps: list[float] = [] # Step Count self.step_count = 0 # total num gpus used self.total_gpus = total_gpus - + # Metric aggregation rule configuration self.aggregation_rules = self._init_aggregation_rules() - - def _init_aggregation_rules(self) -> Dict[str, Dict[str, List[str]]]: + + def _init_aggregation_rules(self) -> dict[str, dict[str, list[str]]]: """Initialize metrics aggregation rules""" return { # Time-Based metrics, can add metrics here - 'time_sum': [ - 'perf/time_per_step' - ], + "time_sum": ["perf/time_per_step"], } - - def add_step_metrics(self, metrics: Dict[str, Any], sample_count: int, timestamp: float = None): + + def add_step_metrics(self, metrics: dict[str, Any], sample_count: int, timestamp: float = None): """Adding a single-step metrics""" if timestamp is None: timestamp = time.time() - + self.sample_counts.append(sample_count) self.timestamps.append(timestamp) self.step_count += 1 - + # Store all metrics values for key, value in metrics.items(): if isinstance(value, (int, float, np.number)): self.metric_values[key].append(float(value)) elif isinstance(value, torch.Tensor): 
self.metric_values[key].append(float(value.item())) - + def _get_aggregation_type(self, metric_name: str) -> str: """Determine the aggregation type based on the metric name""" for agg_type, metric_list in self.aggregation_rules.items(): if metric_name in metric_list: return agg_type - + metric_lower = metric_name.lower() - if any(keyword in metric_lower for keyword in ['timing_s/']): - return 'time_sum' - if any(keyword in metric_lower for keyword in ['mean', 'avg', 'average']): - return 'avg' - if any(keyword in metric_lower for keyword in ['max', 'maximum']): - return 'max' - if any(keyword in metric_lower for keyword in ['min', 'minimum']): - return 'min' - if any(keyword in metric_lower for keyword in ['sum', 'total']): - return 'sum' - if any(keyword in metric_lower for keyword in ['weighted_avg']): - return 'weighted_avg' - + if any(keyword in metric_lower for keyword in ["timing_s/"]): + return "time_sum" + if any(keyword in metric_lower for keyword in ["mean", "avg", "average"]): + return "avg" + if any(keyword in metric_lower for keyword in ["max", "maximum"]): + return "max" + if any(keyword in metric_lower for keyword in ["min", "minimum"]): + return "min" + if any(keyword in metric_lower for keyword in ["sum", "total"]): + return "sum" + if any(keyword in metric_lower for keyword in ["weighted_avg"]): + return "weighted_avg" + import warnings - warnings.warn(f"No aggregation rule is matched in init_aggregation_rules. \ - For metric {metric_name}, the 'avg' method is used") - return 'avg' - def _aggregate_single_metric(self, metric_name: str, values: List[float]) -> float: + warnings.warn( + f"No aggregation rule is matched in init_aggregation_rules. 
\ + For metric {metric_name}, the 'avg' method is used" + ) + return "avg" + + def _aggregate_single_metric(self, metric_name: str, values: list[float]) -> float: """Aggregating a single metric""" if not values: return 0.0 - + agg_type = self._get_aggregation_type(metric_name) - - if agg_type == 'last': + + if agg_type == "last": return values[-1] - - elif agg_type == 'weighted_avg': + + elif agg_type == "weighted_avg": # Weighted average if len(values) != len(self.sample_counts): # If the lengths do not match, use a simple average return sum(values) / len(values) - + total_samples = sum(self.sample_counts) if total_samples == 0: return sum(values) / len(values) - - weighted_sum = sum(v * c for v, c in zip(values, self.sample_counts)) + + weighted_sum = sum(v * c for v, c in zip(values, self.sample_counts, strict=False)) return weighted_sum / total_samples - - elif agg_type == 'sum' or agg_type == 'time_sum': + + elif agg_type == "sum" or agg_type == "time_sum": return sum(values) - - elif agg_type == 'avg': + + elif agg_type == "avg": return sum(values) / len(values) - - elif agg_type == 'max': + + elif agg_type == "max": return max(values) - - elif agg_type == 'min': + + elif agg_type == "min": return min(values) - + else: # Default average return sum(values) / len(values) - - def get_aggregated_metrics(self) -> Dict[str, Any]: + + def get_aggregated_metrics(self) -> dict[str, Any]: """aggregated metrics""" t = time.time() if self.step_count == 0: return {} - + aggregated = {} - + # Aggregate all metrics for metric_name, values in self.metric_values.items(): aggregated[metric_name] = self._aggregate_single_metric(metric_name, values) - - # Aggregate special metrics + + # Aggregate special metrics aggregated = self._special_metrics_aggergate(aggregated) print(f"aggregated metrics done. 
cost {time.time() - t}") - + return aggregated - - def _special_metrics_aggergate(self, aggregated: Dict[str, Any]) -> Dict[str, Any]: + + def _special_metrics_aggergate(self, aggregated: dict[str, Any]) -> dict[str, Any]: """calculate special metrics""" if "global_seqlen/minmax_diff" in aggregated.keys(): aggregated["global_seqlen/minmax_diff"] = aggregated["global_seqlen/max"] - aggregated["global_seqlen/min"] - + REQUIRED_PERF_KEYS = {"perf/throughput", "perf/total_num_tokens", "perf/time_per_step"} if REQUIRED_PERF_KEYS.issubset(aggregated): - aggregated["perf/throughput"] = aggregated['perf/total_num_tokens'] / \ - (aggregated["perf/time_per_step"] * self.total_gpus) - + aggregated["perf/throughput"] = aggregated["perf/total_num_tokens"] / ( + aggregated["perf/time_per_step"] * self.total_gpus + ) + return aggregated - + def reset(self): """Reset Aggregator""" self.metric_values.clear() self.sample_counts.clear() self.timestamps.clear() self.step_count = 0 - - def get_current_stats(self) -> Dict[str, Any]: + + def get_current_stats(self) -> dict[str, Any]: """Get statistics about the current aggregation state (for debugging)""" return { - 'step_count': self.step_count, - 'metric_count': len(self.metric_values), - 'total_samples': sum(self.sample_counts), - 'metric_names': list(self.metric_values.keys()), + "step_count": self.step_count, + "metric_count": len(self.metric_values), + "total_samples": sum(self.sample_counts), + "metric_names": list(self.metric_values.keys()), } diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py index 78fc1784b82..a588679991c 100644 --- a/recipe/fully_async_policy/fully_async_main.py +++ b/recipe/fully_async_policy/fully_async_main.py @@ -218,9 +218,8 @@ def _initialize_components(self, config) -> None: # load checkpoint and sync parameter before doing anything val_before_train = val_reward_fn is not None and config.trainer.get("val_before_train", True) - 
ray.get(self.components["trainer"].load_checkpoint.remote()) - ray.get(param_synchronizer.sync_weights.remote(version=0, - validate=val_before_train)) + ray.get(self.components["trainer"].load_checkpoint.remote()) + ray.get(param_synchronizer.sync_weights.remote(version=0, validate=val_before_train)) self.components["param_synchronizer"] = param_synchronizer print("[ASYNC MAIN] All components initialized successfully") diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index c25a52abbe0..4a2a7d7200c 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -13,10 +13,9 @@ # limitations under the License. import asyncio import time -from pprint import pformat, pprint +from pprint import pformat import ray -from omegaconf import OmegaConf from recipe.fully_async_policy.detach_utils import ( RolloutSample, @@ -165,10 +164,11 @@ async def set_required_samples(self, required_samples: int): # 单次最多扔一次更新需要的样本 self.max_concurrent_samples = int( - self.config.actor_rollout_ref.actor.ppo_mini_batch_size - / self.config.actor_rollout_ref.rollout.n - * self.async_rollout_manager.rollout_dp_size * 8 - ) + self.config.actor_rollout_ref.actor.ppo_mini_batch_size + / self.config.actor_rollout_ref.rollout.n + * self.async_rollout_manager.rollout_dp_size + * 8 + ) self.max_concurrent_samples = min(self.max_concurrent_samples, self.max_required_samples) self.max_queue_size = self.max_required_samples @@ -207,16 +207,13 @@ async def update_param_version(self, version: int, validate: bool = False, globa self.val_reward_fn is not None and self.config.rollout.test_freq > 0 and self.current_param_version % self.config.rollout.test_freq == 0 - and self.current_param_version > 0 # don't test here in the initial parameter sync - ) or ( - validate and self.val_reward_fn is not None - ): + and self.current_param_version > 0 # don't test here in the initial 
parameter sync + ) or (validate and self.val_reward_fn is not None): with marked_timer("testing", timing_raw, color="green"): val_metrics: dict = self._validate() - data = ValidateMetrics(timing_raw=timing_raw, - metrics=val_metrics, - global_steps=global_steps, - param_version=version) + data = ValidateMetrics( + timing_raw=timing_raw, metrics=val_metrics, global_steps=global_steps, param_version=version + ) await self.message_queue_client.put_validate(ray.cloudpickle.dumps(data)) def _validate_config(self): diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 2b549c0b621..0d83a00ba4a 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -15,6 +15,7 @@ import time import warnings from datetime import datetime +from pprint import pprint from typing import Any import ray @@ -22,9 +23,9 @@ from tqdm import tqdm from recipe.fully_async_policy.detach_utils import ( + MetricsAggregator, ValidateMetrics, assemble_batch_from_rollout_samples, - MetricsAggregator, ) from recipe.fully_async_policy.message_queue import MessageQueueClient from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup @@ -37,7 +38,7 @@ WorkerType, ) from verl.utils.debug import marked_timer -from pprint import pprint + @ray.remote(num_cpus=10) class FullyAsyncTrainer(RayPPOTrainer): @@ -121,8 +122,10 @@ def __init__( self.required_samples = int( self.minimal_bsz * config.actor_rollout_ref.actor.ppo_mini_batch_size / config.actor_rollout_ref.rollout.n ) - total_gpus = config.trainer.nnodes * config.trainer.n_gpus_per_node + \ - config.rollout.nnodes * config.rollout.n_gpus_per_node + total_gpus = ( + config.trainer.nnodes * config.trainer.n_gpus_per_node + + config.rollout.nnodes * config.rollout.n_gpus_per_node + ) self.metrics_aggregator = MetricsAggregator(total_gpus=total_gpus) def set_message_queue_client(self, message_queue_client: 
MessageQueueClient): @@ -303,9 +306,7 @@ def fit(self): self._collect_metrics(batch, 0, metrics, timing_raw) self.metrics_aggregator.add_step_metrics( - metrics=metrics, - sample_count=self.required_samples, - timestamp=time.time() + metrics=metrics, sample_count=self.required_samples, timestamp=time.time() ) # Trigger parameter synchronization after training step time_str = datetime.now().strftime("%H:%M:%S.%f")[:-3] @@ -321,23 +322,25 @@ def fit(self): val_data: ValidateMetrics = ray.cloudpickle.loads(val_data) self.logger.log(data=val_data.metrics, step=val_data.param_version) self.logger.log(data=val_data.timing_raw, step=val_data.param_version) - pprint(f"[FullyAsyncTrainer] parameter version: {val_data.param_version} \ - Validation metrics: {val_data.metrics}") + pprint( + f"[FullyAsyncTrainer] parameter version: {val_data.param_version} \ + Validation metrics: {val_data.metrics}" + ) self.global_steps += 1 # final parameter sync and validate if val_data is None: - self._trigger_parameter_sync_after_step(validate=True, global_steps=self.global_steps-1) + self._trigger_parameter_sync_after_step(validate=True, global_steps=self.global_steps - 1) ray.get(self.param_synchronizer.wait_last_sync.remote()) val_data = self.message_queue_client.get_validate_sync() if val_data: val_data: ValidateMetrics = ray.cloudpickle.loads(val_data) self.logger.log(data=val_data.metrics, step=val_data.param_version) - self.logger.log(data=val_data.timing_raw, step=val_data.param_version) + self.logger.log(data=val_data.timing_raw, step=val_data.param_version) pprint(f"[FullyAsyncTrainer] Final validation metrics: {val_data.metrics}") self.progress_bar.close() - self._check_save_checkpoint(True, timing_raw) # TODO: 检查checkpoint + self._check_save_checkpoint(True, timing_raw) # TODO: 检查checkpoint def load_checkpoint(self): return self._load_checkpoint() @@ -347,20 +350,21 @@ def _trigger_parameter_sync_after_step(self, validate: bool = False, global_step Trigger parameter 
synchronization after training step This ensures rollouter always uses the latest trained parameters """ - if self.local_trigger_step < self.trigger_parameter_sync_step and not validate: + if self.local_trigger_step < self.trigger_parameter_sync_step and not validate: self.local_trigger_step += 1 return - self.current_param_version += 1 + self.current_param_version += 1 self.local_trigger_step = 1 self.logger.log( data=self.metrics_aggregator.get_aggregated_metrics(), step=self.current_param_version, - ) + ) self.progress_bar.update(1) self.metrics_aggregator.reset() ray.get(self.param_synchronizer.wait_last_sync.remote()) - ray.get(self.param_synchronizer.sync_weights.remote(self.current_param_version, - validate=validate, - global_steps=global_steps) - ) \ No newline at end of file + ray.get( + self.param_synchronizer.sync_weights.remote( + self.current_param_version, validate=validate, global_steps=global_steps + ) + ) diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py index da1780deb47..22573f4b9d5 100644 --- a/recipe/fully_async_policy/message_queue.py +++ b/recipe/fully_async_policy/message_queue.py @@ -261,4 +261,4 @@ def get_statistics_sync(self) -> dict[str, Any]: def update_param_version_sync(self, version: int): """Update parameter version (async)""" - return ray.get(self.queue_actor.update_param_version.remote(version)) \ No newline at end of file + return ray.get(self.queue_actor.update_param_version.remote(version)) diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py index 34fbca1c3e3..35efdd9c950 100644 --- a/recipe/fully_async_policy/param_sync.py +++ b/recipe/fully_async_policy/param_sync.py @@ -72,7 +72,7 @@ def _init_sync_group(self): group_name=self.sync_group_name, ) - def sync_weights(self, version, validate = False, global_steps = 0): + def sync_weights(self, version, validate=False, global_steps=0): start_time = time.time() self.current_version = 
version @@ -94,9 +94,8 @@ def sync_weights(self, version, validate = False, global_steps = 0): self.wait_last = self.rollouter.resume.remote() def wait_last_sync(self): - print(f"[ParameterSynchronizer] waiting last parameter sync and validate...") - start_time = time.time() + print("[ParameterSynchronizer] waiting last parameter sync and validate...") + start_time = time.time() if self.wait_last: ray.get(self.wait_last) print(f"[ParameterSynchronizer], cost: {time.time() - start_time:.2f} seconds") - diff --git a/verl/experimental/agent_loop/__init__.py b/verl/experimental/agent_loop/__init__.py index 0d131dd1d3a..67dcb16047e 100644 --- a/verl/experimental/agent_loop/__init__.py +++ b/verl/experimental/agent_loop/__init__.py @@ -13,8 +13,8 @@ # limitations under the License. from .agent_loop import AgentLoopBase, AgentLoopManager -from .single_turn_agent_loop import SingleTurnAgentLoop from .partial_single_turn_agent_loop import PartialSingleTurnAgentLoop +from .single_turn_agent_loop import SingleTurnAgentLoop from .tool_agent_loop import ToolAgentLoop _ = [SingleTurnAgentLoop, ToolAgentLoop, PartialSingleTurnAgentLoop] diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py index 8c49390f456..117ca13a7a7 100644 --- a/verl/experimental/agent_loop/agent_loop.py +++ b/verl/experimental/agent_loop/agent_loop.py @@ -469,7 +469,10 @@ async def _run_agent_loop( server_manager=self.server_manager, tokenizer=self.tokenizer, ) - output = await agent_loop.run(messages, sampling_params, partial_output) + if agent_name == "partial_single_turn_agent": + output = await agent_loop.run(messages, sampling_params, partial_output) + else: + output = await agent_loop.run(messages, sampling_params) return output diff --git a/verl/experimental/agent_loop/single_turn_agent_loop.py b/verl/experimental/agent_loop/single_turn_agent_loop.py index df6e1991888..411388e7321 100644 --- a/verl/experimental/agent_loop/single_turn_agent_loop.py +++ 
b/verl/experimental/agent_loop/single_turn_agent_loop.py @@ -13,7 +13,7 @@ # limitations under the License. import logging import os -from typing import Any, Optional +from typing import Any from uuid import uuid4 from verl.experimental.agent_loop.agent_loop import AgentLoopBase, AgentLoopOutput, register @@ -32,9 +32,7 @@ def __init__(self, *args, **kwargs): self.prompt_length = self.config.actor_rollout_ref.rollout.prompt_length self.response_length = self.config.actor_rollout_ref.rollout.response_length - async def run( - self, messages: list[dict[str, Any]], sampling_params: dict[str, Any], output: Optional[AgentLoopOutput] - ) -> AgentLoopOutput: + async def run(self, messages: list[dict[str, Any]], sampling_params: dict[str, Any]) -> AgentLoopOutput: metrics = {} request_id = uuid4().hex prompt_ids = await self.loop.run_in_executor( diff --git a/verl/experimental/agent_loop/tool_agent_loop.py b/verl/experimental/agent_loop/tool_agent_loop.py index 7c945b7d4c9..3437c0be5ab 100644 --- a/verl/experimental/agent_loop/tool_agent_loop.py +++ b/verl/experimental/agent_loop/tool_agent_loop.py @@ -15,7 +15,7 @@ import json import logging import os -from typing import Any, Optional +from typing import Any from uuid import uuid4 from verl.experimental.agent_loop.agent_loop import AgentLoopBase, AgentLoopOutput, register @@ -56,9 +56,7 @@ def init_class(cls, config, tokenizer, **kwargs): cls.system_prompt = tokenizer.apply_chat_template([{}], add_generation_prompt=False, tokenize=True) @rollout_trace_op - async def run( - self, messages: list[dict[str, Any]], sampling_params: dict[str, Any], output: Optional[AgentLoopOutput] - ) -> AgentLoopOutput: + async def run(self, messages: list[dict[str, Any]], sampling_params: dict[str, Any]) -> AgentLoopOutput: metrics = {} request_id = uuid4().hex prompt_ids = await self.loop.run_in_executor( diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py index 8d2c19d3364..56a1e5bcab1 100644 --- 
a/verl/trainer/ppo/ray_trainer.py +++ b/verl/trainer/ppo/ray_trainer.py @@ -1287,7 +1287,6 @@ def _process_batch_common(self, batch, metrics, timing_raw): } ) - if self.use_reference_policy: # compute reference log_prob with marked_timer("ref", timing_raw, color="olive"): From 7bd48597adc7a48ab77a1af34fe7265ed3c2e37b Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Tue, 9 Sep 2025 22:25:12 +0800 Subject: [PATCH 120/182] refactor vllm async --- verl/experimental/agent_loop/agent_loop.py | 15 ++------------- .../rollout/vllm_rollout/vllm_async_server.py | 3 ++- .../rollout/vllm_rollout/vllm_rollout_spmd.py | 2 -- 3 files changed, 4 insertions(+), 16 deletions(-) diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py index 117ca13a7a7..d00b7176380 100644 --- a/verl/experimental/agent_loop/agent_loop.py +++ b/verl/experimental/agent_loop/agent_loop.py @@ -182,16 +182,12 @@ def init_class(cls, config: DictConfig, tokenizer: AutoTokenizer, **kwargs): cls._class_initialized = True @abstractmethod - async def run( - self, messages: list[dict[str, Any]], sampling_params: dict[str, Any], partial_output: Optional[AgentLoopOutput] - ) -> AgentLoopOutput: + async def run(self, messages: list[dict[str, Any]], sampling_params: dict[str, Any]) -> AgentLoopOutput: """Run agent loop to interact with LLM server and environment. Args: messages (List[Dict[str, Any]]): Input messages. sampling_params (Dict[str, Any]): LLM sampling params. - partial_output: Optional[AgentLoopOutput]: already rollout result. - Returns: AgentLoopOutput: Agent loop output. 
""" @@ -567,17 +563,10 @@ def _initialize_llm_servers(self): def _init_agent_loop_workers(self): self.agent_loop_workers = [] - # 获取建议的资源配置 - agent_config = self.config.actor_rollout_ref.rollout.agent - max_concurrency = agent_config.get("max_concurrency", 10) - num_cpus = agent_config.get("num_cpus", 2) # 默认2个CPU核心 - - for i in range(agent_config.num_workers): + for i in range(self.config.actor_rollout_ref.rollout.agent.num_workers): self.agent_loop_workers.append( AgentLoopWorker.options( name=f"agent_loop_worker_{i}", - max_concurrency=max_concurrency, # 设置最大并发数 - num_cpus=num_cpus, # 设置CPU资源需求 ).remote(self.config, self.async_llm_servers) ) diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py index 3b3e9542252..2dc386e76fa 100644 --- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py +++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py @@ -74,6 +74,7 @@ def get_pg_index_and_local_rank(actor_name) -> tuple[int, int]: actor_names = sorted(actor_names, key=get_pg_index_and_local_rank) actor_names = actor_names[vllm_dp_rank * vllm_tp_size : (vllm_dp_rank + 1) * vllm_tp_size] workers: list[WorkerWrapperBase] = [ray.get_actor(actor_name) for actor_name in actor_names] + print(f"instance_id: {vllm_config.instance_id} initializes with external actors: {actor_names}") return workers @@ -205,8 +206,8 @@ def __init__(self, config: DictConfig, vllm_dp_size: int, vllm_dp_rank: int, wg_ self.vllm_dp_rank = vllm_dp_rank self.wg_prefix = wg_prefix self.engine: AsyncLLM = None - # for cancel + # for cancel LLMServer self.paused = False self.lock = asyncio.Lock() self.cancel_event: dict[str, asyncio.Event] = {} diff --git a/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py b/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py index 071dd917119..5bd571016ac 100644 --- a/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py +++ b/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py @@ 
-58,7 +58,6 @@ logger = logging.getLogger(__file__) logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) - # TODO # 1. support pp in vllm # 2. passing tokenizer is not necessary? no encoding/decoding is happending here @@ -459,7 +458,6 @@ def get_zeromq_address(self): def init_worker(self, all_kwargs: list[dict[str, Any]]): """Initialize worker engine.""" - all_kwargs[0]["rank"] = int(os.environ["RANK"]) all_kwargs[0]["local_rank"] = 0 From ec3f0c52fb445968add8201e08f8f89467a31d80 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Tue, 9 Sep 2025 22:30:22 +0800 Subject: [PATCH 121/182] refactor logs --- verl/experimental/agent_loop/agent_loop.py | 1 + verl/workers/rollout/vllm_rollout/vllm_async_server.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py index d00b7176380..ddcad093326 100644 --- a/verl/experimental/agent_loop/agent_loop.py +++ b/verl/experimental/agent_loop/agent_loop.py @@ -188,6 +188,7 @@ async def run(self, messages: list[dict[str, Any]], sampling_params: dict[str, A Args: messages (List[Dict[str, Any]]): Input messages. sampling_params (Dict[str, Any]): LLM sampling params. + Returns: AgentLoopOutput: Agent loop output. 
""" diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py index 2dc386e76fa..4826ebaa1d0 100644 --- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py +++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py @@ -370,7 +370,7 @@ async def generate_for_partial( token_ids = self.req_output[request_id].outputs[0].token_ids log_probs: list[float] = [] for i, x in enumerate(self.req_output[request_id].outputs[0].logprobs): - # sampling_params 中 logprobs 设置为1,只返回1个 + # sampling_params 中 logprobs 设置为1,应该返回1个, 但是实测会有多个,取token_id所对应的log_prob token_id = self.req_output[request_id].outputs[0].token_ids[i] log_probs.append(x[token_id].logprob) is_cancel = generation_handle not in done From 547d68f8572c68626be7ae60766bdad9c39d65d2 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Wed, 10 Sep 2025 15:46:44 +0800 Subject: [PATCH 122/182] qwen3 A3B --- .../fsdp2_fully-async_64-64_stal0.1/run.sh | 168 ++++++++++++++++++ .../runtime_env.yaml | 4 + .../exp/qwen3-30BA3B_64/fsdp2_colocate/run.sh | 125 +++++++++++++ .../fsdp2_colocate/runtime_env.yaml | 5 + .../fsdp2_fully-async_32-32/run.sh | 150 ++++++++++++++++ .../fsdp2_fully-async_32-32/runtime_env.yaml | 4 + .../megatron_colocate/runtime_env.yaml | 5 +- 7 files changed, 457 insertions(+), 4 deletions(-) create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64_stal0.1/run.sh create mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64_stal0.1/runtime_env.yaml create mode 100644 recipe/fully_async_policy/exp/qwen3-30BA3B_64/fsdp2_colocate/run.sh create mode 100644 recipe/fully_async_policy/exp/qwen3-30BA3B_64/fsdp2_colocate/runtime_env.yaml create mode 100644 recipe/fully_async_policy/exp/qwen3-30BA3B_64/fsdp2_fully-async_32-32/run.sh create mode 100644 recipe/fully_async_policy/exp/qwen3-30BA3B_64/fsdp2_fully-async_32-32/runtime_env.yaml diff --git 
a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64_stal0.1/run.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64_stal0.1/run.sh new file mode 100644 index 00000000000..e9133e50eac --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64_stal0.1/run.sh @@ -0,0 +1,168 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +project_name='DAPO' +exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tpf4_stal0.1' + +# Ray +# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} +# WORKING_DIR=${WORKING_DIR:-"${PWD}"} +# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} +# Paths +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface +MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"} +CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} +TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} +TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} + +# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B +MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B +CKPTS_DIR=./ckpts/${project_name}/${exp_name} +# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet +# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet +TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet +TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet + +rollout_mode="async" +rollout_name="vllm" # sglang or vllm +if [ "$rollout_mode" = "async" ]; then + export 
VLLM_USE_V1=1 + return_raw_chat="True" +fi + +# Algorithm parameters +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +# Response length parameters +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 * 28)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 + +# Training parameters +loss_agg_mode="token-mean" + +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + +# Performance Related Parameter +use_dynamic_bsz=True +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) +ref_offload=True +actor_offload=False +gen_tp=4 +sp_size=4 +fsdp_size=2 + +# Fully async specific parameters +NNODES_ROLLOUT=${NNODES_ROLLOUT:-8} +NNODES_TRAIN=${NNODES_TRAIN:-8} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} + + +train_prompt_bsz=0 +gen_prompt_bsz=1 +n_resp_per_prompt=16 +train_prompt_mini_bsz=32 +total_rollout_steps=$(((512*400))) +test_freq=20 +staleness_threshold=0.1 +trigger_parameter_sync_step=4 +partial_rollout=True + +python -m recipe.fully_async_policy.fully_async_main \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + data.gen_batch_size=${gen_prompt_bsz} \ + data.return_raw_chat=${return_raw_chat} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.strategy=fsdp2 \ + critic.strategy=fsdp2 \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + 
actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.hybrid_engine=False \ + +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ + actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.grad_clip=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + 
actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \ + actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ + actor_rollout_ref.rollout.name=${rollout_name} \ + actor_rollout_ref.rollout.mode=${rollout_mode} \ + actor_rollout_ref.rollout.calculate_log_probs=True \ + reward_model.reward_manager=dapo \ + +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger=['console','tensorboard'] \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.val_before_train=True \ + trainer.save_freq=-1 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.resume_mode=auto \ + trainer.nnodes="${NNODES_TRAIN}" \ + trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ + rollout.nnodes="${NNODES_ROLLOUT}" \ + rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ + rollout.total_rollout_steps="${total_rollout_steps}" \ + rollout.total_epochs=10 \ + rollout.test_freq="${test_freq}" \ + async_training.staleness_threshold="${staleness_threshold}" \ + async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ + async_training.partial_rollout="${partial_rollout}" diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64_stal0.1/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64_stal0.1/runtime_env.yaml new file mode 100644 index 00000000000..92bacbdd204 --- /dev/null +++ 
b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64_stal0.1/runtime_env.yaml @@ -0,0 +1,4 @@ +env_vars: + VLLM_USE_V1: "1" + TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-128/dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tpf4_stal0.1" + HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_64/fsdp2_colocate/run.sh b/recipe/fully_async_policy/exp/qwen3-30BA3B_64/fsdp2_colocate/run.sh new file mode 100644 index 00000000000..5a0ca29d2a5 --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen3-30BA3B_64/fsdp2_colocate/run.sh @@ -0,0 +1,125 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +project_name='DAPO' +exp_name='dapo_qwen3-30BA3B_8k_fsdp2_colocate_64_mbs32' + +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 * 8)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 + +loss_agg_mode="token-mean" + +train_prompt_bsz=512 +n_resp_per_prompt=16 +train_prompt_mini_bsz=32 + +# Ray +# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} +# WORKING_DIR=${WORKING_DIR:-"${PWD}"} +# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} +NNODES=${NNODES:-8} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} +# Paths +MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/basemodel/Qwen/Qwen3-30B-A3B +CKPTS_DIR=./ckpts/${project_name}/${exp_name} +TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet +TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet + +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + +# Performance Related 
Parameter +sp_size=4 +use_dynamic_bsz=True +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) +offload=True +gen_tp=4 +fsdp_size=32 + +python3 -m verl.trainer.main_ppo \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \ + 
actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.grad_clip=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \ + actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ + reward_model.reward_manager=dapo \ + +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger='["console","tensorboard"]' \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ + trainer.nnodes="${NNODES}" \ + trainer.val_before_train=True \ + trainer.test_freq=20 \ + trainer.save_freq=-1 \ + trainer.total_epochs=10 \ + trainer.total_training_steps=400 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.resume_mode=auto \ + 
trainer.log_val_generations=10 diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_64/fsdp2_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen3-30BA3B_64/fsdp2_colocate/runtime_env.yaml new file mode 100644 index 00000000000..b2333e66179 --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen3-30BA3B_64/fsdp2_colocate/runtime_env.yaml @@ -0,0 +1,5 @@ +env_vars: + TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen3-30BA3B-64/dapo_qwen3-30BA3B_8k_fsdp2_colocate_64_mbs32" + HYDRA_FULL_ERROR: "1" + TORCH_NCCL_AVOID_RECORD_STREAMS: "1" + CUDA_DEVICE_MAX_CONNECTIONS: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_64/fsdp2_fully-async_32-32/run.sh b/recipe/fully_async_policy/exp/qwen3-30BA3B_64/fsdp2_fully-async_32-32/run.sh new file mode 100644 index 00000000000..1cee5cce560 --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen3-30BA3B_64/fsdp2_fully-async_32-32/run.sh @@ -0,0 +1,150 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +project_name='DAPO' +exp_name='dapo_qwen3-30BA3B_8k_fsdp2_async_32-32_mbs32_tpf8' + +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 * 8)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 + +loss_agg_mode="token-mean" + +# Paths +MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/basemodel/Qwen/Qwen3-30B-A3B +CKPTS_DIR=./ckpts/${project_name}/${exp_name} +TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet +TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet + +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + +# Performance 
Related Parameter +sp_size=4 +use_dynamic_bsz=True +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) +ref_offload=True +actor_offload=False +gen_tp=4 +fsdp_size=32 + +# Fully async specific parameters +NNODES_ROLLOUT=${NNODES_ROLLOUT:-4} +NNODES_TRAIN=${NNODES_TRAIN:-4} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} + +train_prompt_bsz=0 +gen_prompt_bsz=1 +n_resp_per_prompt=16 +train_prompt_mini_bsz=32 +total_rollout_steps=$(((512*400))) +test_freq=20 +staleness_threshold=0.1 +trigger_parameter_sync_step=8 +partial_rollout=True + +rollout_mode="async" +rollout_name="vllm" # sglang or vllm +if [ "$rollout_mode" = "async" ]; then + export VLLM_USE_V1=1 + return_raw_chat="True" +fi + +python -m recipe.fully_async_policy.fully_async_main \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + data.gen_batch_size=${gen_prompt_bsz} \ + data.return_raw_chat=${return_raw_chat} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.strategy=fsdp2 \ + critic.strategy=fsdp2 \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.hybrid_engine=False \ + +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ + actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ + 
actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.grad_clip=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \ + actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ + actor_rollout_ref.rollout.name=${rollout_name} \ + 
actor_rollout_ref.rollout.mode=${rollout_mode} \ + actor_rollout_ref.rollout.calculate_log_probs=True \ + reward_model.reward_manager=dapo \ + +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger=['console','tensorboard'] \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.val_before_train=True \ + trainer.save_freq=-1 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.resume_mode=auto \ + trainer.nnodes="${NNODES_TRAIN}" \ + trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ + rollout.nnodes="${NNODES_ROLLOUT}" \ + rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ + rollout.total_rollout_steps="${total_rollout_steps}" \ + rollout.total_epochs=10 \ + rollout.test_freq="${test_freq}" \ + async_training.staleness_threshold="${staleness_threshold}" \ + async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ + async_training.partial_rollout="${partial_rollout}" diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_64/fsdp2_fully-async_32-32/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen3-30BA3B_64/fsdp2_fully-async_32-32/runtime_env.yaml new file mode 100644 index 00000000000..817cea30d09 --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen3-30BA3B_64/fsdp2_fully-async_32-32/runtime_env.yaml @@ -0,0 +1,4 @@ +env_vars: + TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen3-30BA3B-64/dapo_qwen3-30BA3B_8k_fsdp2_async_32-32_mbs32_tpf8" + HYDRA_FULL_ERROR: "1" + VLLM_USE_V1: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/runtime_env.yaml 
b/recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/runtime_env.yaml index 3a497e90dd0..3fa60a48917 100644 --- a/recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/runtime_env.yaml +++ b/recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/runtime_env.yaml @@ -1,5 +1,2 @@ env_vars: - TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen3-30BA3B-64/dapo_qwen3-30BA3B_32k_megatron_colocate_64_mbs32" - HYDRA_FULL_ERROR: "1" - TORCH_NCCL_AVOID_RECORD_STREAMS: "1" - CUDA_DEVICE_MAX_CONNECTIONS: "1" \ No newline at end of file + TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen3-30BA3B-64/dapo_qwen3-30BA3B_8k_fsdp2_colocate_64_mbs32" From 1fc52bb5087d50858dbb9127d0d684d0d195f2a6 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Wed, 10 Sep 2025 19:45:07 +0800 Subject: [PATCH 123/182] staleness_threshold=0.1 --- .../exp/qwen2-32B_128/fsdp2_colocate/run.sh | 133 +++++++++++++++ .../fsdp2_colocate}/runtime_env.yaml | 4 +- .../fsdp2_fully-async_64-64/run.sh | 153 +++++++++++++++++ .../fsdp2_fully-async_64-64/runtime_env.yaml | 4 + ...{dapo_7b_math_fsdp2_colocate.sh => run.sh} | 0 ...fully-async_64-64_mbs32_tfq4.sh => run.sh} | 2 +- ...po_7b_math_megatron_colocate.sh => run.sh} | 0 ...{dapo_7b_math_fsdp2_colocate.sh => run.sh} | 0 ...ully-async_16-16_mbs32_tfq16.sh => run.sh} | 2 +- ...ully-async_16-16_mbs32_tfq16.sh => run.sh} | 0 ...fully-async_24-8_mbs32_tfq32.sh => run.sh} | 2 +- ...fully-async_8-24_mbs32_tfq11.sh => run.sh} | 2 +- ...po_7b_math_megatron_colocate.sh => run.sh} | 0 ...{dapo_7b_math_fsdp2_colocate.sh => run.sh} | 0 ...fully-async_24-40_mbs32_tfq6.sh => run.sh} | 2 +- ...fully-async_32-32_mbs32_tfq8.sh => run.sh} | 2 +- ...ully-async_40-24_mbs32_tfq11.sh => run.sh} | 2 +- ...po_7b_math_megatron_colocate.sh => run.sh} | 0 .../{test_dapo_qwen3_30b_math.sh => run.sh} | 0 .../{early_megatron_colocate.sh => run.sh} | 0 
.../early_megatron_colocate.sh | 161 ------------------ .../megatron_colocate/runtime_env.yaml | 5 - .../early_megatron_colocate.sh | 161 ------------------ .../megatron_colocate/runtime_env.yaml | 5 - .../fsdp2_colocate/{fsdp2.sh => run.sh} | 0 .../{early_megatron_colocate.sh => run.sh} | 0 .../early_megatron_colocate.sh | 156 ----------------- .../early_megatron_colocate.sh | 156 ----------------- .../megatron_colocate/runtime_env.yaml | 5 - .../shell/dapo_7b_math_fsdp2_2_6.sh | 2 +- .../shell/dapo_7b_math_fsdp2_4_12.sh | 2 +- .../shell/dapo_7b_math_fsdp2_8_8.sh | 2 +- tests/special_e2e/run_fully_async_policy.sh | 2 +- 33 files changed, 303 insertions(+), 662 deletions(-) create mode 100644 recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_colocate/run.sh rename recipe/fully_async_policy/exp/{qwen3-32B_32/megatron_colocate => qwen2-32B_128/fsdp2_colocate}/runtime_env.yaml (50%) create mode 100644 recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh create mode 100644 recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/runtime_env.yaml rename recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_colocate/{dapo_7b_math_fsdp2_colocate.sh => run.sh} (100%) rename recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/{dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tfq4.sh => run.sh} (99%) rename recipe/fully_async_policy/exp/qwen2-7B-math_128/megatron_colocate/{dapo_7b_math_megatron_colocate.sh => run.sh} (100%) rename recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate/{dapo_7b_math_fsdp2_colocate.sh => run.sh} (100%) rename recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/{dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tfq16.sh => run.sh} (99%) rename recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16_stal0.1/{dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tfq16.sh => run.sh} (100%) rename 
recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/{dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-8_mbs32_tfq32.sh => run.sh} (99%) rename recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/{dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tfq11.sh => run.sh} (99%) rename recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/{dapo_7b_math_megatron_colocate.sh => run.sh} (100%) rename recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_colocate/{dapo_7b_math_fsdp2_colocate.sh => run.sh} (100%) rename recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/{dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-40_mbs32_tfq6.sh => run.sh} (99%) rename recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/{dapo_qwen2-7B-math_28k_fsdp2_fully-async_32-32_mbs32_tfq8.sh => run.sh} (99%) rename recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/{dapo_qwen2-7B-math_28k_fsdp2_fully-async_40-24_mbs32_tfq11.sh => run.sh} (99%) rename recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/{dapo_7b_math_megatron_colocate.sh => run.sh} (100%) rename recipe/fully_async_policy/exp/qwen3-30BA3B_128/fsdp2_colocate/{test_dapo_qwen3_30b_math.sh => run.sh} (100%) rename recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/{early_megatron_colocate.sh => run.sh} (100%) delete mode 100644 recipe/fully_async_policy/exp/qwen3-30BA3B_32/megatron_colocate/early_megatron_colocate.sh delete mode 100644 recipe/fully_async_policy/exp/qwen3-30BA3B_32/megatron_colocate/runtime_env.yaml delete mode 100644 recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/early_megatron_colocate.sh delete mode 100644 recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/runtime_env.yaml rename recipe/fully_async_policy/exp/qwen3-32B_128/fsdp2_colocate/{fsdp2.sh => run.sh} (100%) rename recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/{early_megatron_colocate.sh => 
run.sh} (100%) delete mode 100644 recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/early_megatron_colocate.sh delete mode 100644 recipe/fully_async_policy/exp/qwen3-32B_64/megatron_colocate/early_megatron_colocate.sh delete mode 100644 recipe/fully_async_policy/exp/qwen3-32B_64/megatron_colocate/runtime_env.yaml diff --git a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_colocate/run.sh b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_colocate/run.sh new file mode 100644 index 00000000000..92203a7d87a --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_colocate/run.sh @@ -0,0 +1,133 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +project_name='DAPO' +exp_name='dapo_qwen2-32B_20k_fsdp2_colocate_128' + +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 * 20)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 + +loss_agg_mode="token-mean" + +train_prompt_bsz=512 +n_resp_per_prompt=16 +train_prompt_mini_bsz=32 + +# Ray +# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} +# WORKING_DIR=${WORKING_DIR:-"${PWD}"} +# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} +NNODES=${NNODES:-16} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} +# Paths +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +# very important! 
please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface + +MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/Qwen2.5-32B +CKPTS_DIR=./ckpts/${project_name}/${exp_name} +TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet +TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + +# Performance Related Parameter +use_dynamic_bsz=True +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) +offload=True +gen_tp=4 +sp_size=8 +fsdp_size=-1 + +# reference run wandb: https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/runs/ow47vvon?nw=nwusertongyuxuan361 + + +python -m verl.trainer.main_ppo \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.strategy=fsdp2 \ + critic.strategy=fsdp2 \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + actor_rollout_ref.model.use_remove_padding=True \ + +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ + 
actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.grad_clip=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \ + actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ + 
actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ + reward_model.reward_manager=dapo \ + +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger=['console','tensorboard'] \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ + trainer.nnodes="${NNODES}" \ + trainer.val_before_train=True \ + trainer.test_freq=20 \ + trainer.save_freq=-1 \ + trainer.total_epochs=10 \ + trainer.total_training_steps=400 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.resume_mode=auto \ + trainer.log_val_generations=10 diff --git a/recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_colocate/runtime_env.yaml similarity index 50% rename from recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/runtime_env.yaml rename to recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_colocate/runtime_env.yaml index 2d0930d13ab..e33cfd681ca 100644 --- a/recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/runtime_env.yaml +++ b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_colocate/runtime_env.yaml @@ -1,5 +1,5 @@ env_vars: - TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen3-32B-32/dapo_qwen3-32B_32k_megatron_colocate_32_mbs32" + TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-32B-128/dapo_qwen2-32B_20k_fsdp2_colocate_128" HYDRA_FULL_ERROR: "1" TORCH_NCCL_AVOID_RECORD_STREAMS: "1" - CUDA_DEVICE_MAX_CONNECTIONS: "1" \ No newline at end of file + 
VLLM_USE_V1: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh
new file mode 100644
index 00000000000..270533a84c4
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh
@@ -0,0 +1,159 @@
+#!/usr/bin/env bash
+# Fully-async DAPO launcher: Qwen2.5-32B, GRPO advantages, FSDP2 actor, vLLM async rollout.
+# Node counts are passed separately to trainer.* and rollout.*; override them with
+# NNODES_TRAIN / NNODES_ROLLOUT / NGPUS_PER_NODE (each defaults to 8).
+set -xeuo pipefail
+
+project_name='DAPO'
+exp_name='dapo_qwen2-32B_20k_fsdp2_fully-async_64-64'
+
+# Paths
+MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/Qwen2.5-32B
+CKPTS_DIR=./ckpts/${project_name}/${exp_name}
+TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet
+
+rollout_mode="async"
+rollout_name="vllm" # sglang or vllm
+# Default keeps `set -u` from aborting the launch when rollout_mode is not "async".
+return_raw_chat="False"
+if [ "$rollout_mode" = "async" ]; then
+    export VLLM_USE_V1=1
+    return_raw_chat="True"
+fi
+
+# Algorithm parameters
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+# Response length parameters
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 * 20))
+enable_overlong_buffer=True
+overlong_buffer_len=$((1024 * 4))
+overlong_penalty_factor=1.0
+
+# Training parameters
+loss_agg_mode="token-mean"
+
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+# Performance Related Parameter
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) # 2x (prompt + response) tokens
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) # 3x (prompt + response) tokens
+ref_offload=True
+actor_offload=False
+gen_tp=4
+sp_size=8
+fsdp_size=-1
+
+# Fully async specific parameters
+NNODES_ROLLOUT=${NNODES_ROLLOUT:-8}
+NNODES_TRAIN=${NNODES_TRAIN:-8}
+NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+
+train_prompt_bsz=0 # NOTE(review): 512 in the colocate baselines; presumably unused by the fully-async trainer - confirm
+gen_prompt_bsz=1
+n_resp_per_prompt=16
+train_prompt_mini_bsz=32
+total_rollout_steps=$(((512*400))) # 204800; presumably 512 prompts x 400 steps as in the colocate baselines - confirm
+test_freq=20
+staleness_threshold=0.1
+trigger_parameter_sync_step=4
+partial_rollout=True
+
+# trainer.logger is single-quoted so the shell cannot glob-expand or strip the list literal.
+python -m recipe.fully_async_policy.fully_async_main \
+    data.train_files="${TRAIN_FILE}" \
+    data.val_files="${TEST_FILE}" \
+    data.prompt_key=prompt \
+    data.truncation='left' \
+    data.max_prompt_length=${max_prompt_length} \
+    data.max_response_length=${max_response_length} \
+    data.train_batch_size=${train_prompt_bsz} \
+    data.gen_batch_size=${gen_prompt_bsz} \
+    data.return_raw_chat=${return_raw_chat} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    algorithm.adv_estimator=${adv_estimator} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    algorithm.kl_ctrl.kl_coef=${kl_coef} \
+    actor_rollout_ref.actor.strategy=fsdp2 \
+    critic.strategy=fsdp2 \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+    actor_rollout_ref.actor.clip_ratio_c=10.0 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    actor_rollout_ref.hybrid_engine=False \
+    +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
+    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+    actor_rollout_ref.actor.optim.weight_decay=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.grad_clip=1.0 \
+    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.enable_chunked_prefill=True \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.temperature=${temperature} \
+    actor_rollout_ref.rollout.top_p=${top_p} \
+    actor_rollout_ref.rollout.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+    actor_rollout_ref.rollout.val_kwargs.n=1 \
+    actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \
+    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
+    actor_rollout_ref.rollout.name=${rollout_name} \
+    actor_rollout_ref.rollout.mode=${rollout_mode} \
+    actor_rollout_ref.rollout.calculate_log_probs=True \
+    reward_model.reward_manager=dapo \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+    trainer.logger='["console","tensorboard"]' \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.val_before_train=True \
+    trainer.save_freq=-1 \
+    trainer.default_local_dir="${CKPTS_DIR}" \
+    trainer.resume_mode=auto \
+    trainer.nnodes="${NNODES_TRAIN}" \
+    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.nnodes="${NNODES_ROLLOUT}" \
+    rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
+    rollout.total_rollout_steps="${total_rollout_steps}" \
+    rollout.total_epochs=10 \
+    rollout.test_freq="${test_freq}" \
+    async_training.staleness_threshold="${staleness_threshold}" \
+    async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
+    async_training.partial_rollout="${partial_rollout}"
diff --git a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/runtime_env.yaml
new file mode 100644
index 00000000000..77590fb2709
--- /dev/null
+++ b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/runtime_env.yaml
@@ -0,0 +1,4 @@
+env_vars:
+  VLLM_USE_V1: "1"
+  TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-32B-128/dapo_qwen2-32B_20k_fsdp2_fully-async_64-64"
+  HYDRA_FULL_ERROR: "1"
\ No newline at end of file
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_colocate/run.sh
similarity index 100%
rename from recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh
rename to recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_colocate/run.sh
diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tfq4.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/run.sh
similarity index 99%
rename from
recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tfq4.sh rename to recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/run.sh index 9f410f95c6c..03ebab25cea 100644 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tfq4.sh +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/run.sh @@ -80,7 +80,7 @@ n_resp_per_prompt=16 train_prompt_mini_bsz=32 total_rollout_steps=$(((512*400))) test_freq=20 -staleness_threshold=1 +staleness_threshold=0.1 trigger_parameter_sync_step=4 partial_rollout=True diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/megatron_colocate/dapo_7b_math_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_128/megatron_colocate/run.sh similarity index 100% rename from recipe/fully_async_policy/exp/qwen2-7B-math_128/megatron_colocate/dapo_7b_math_megatron_colocate.sh rename to recipe/fully_async_policy/exp/qwen2-7B-math_128/megatron_colocate/run.sh diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate/run.sh similarity index 100% rename from recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh rename to recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate/run.sh diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tfq16.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/run.sh similarity index 99% rename from recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tfq16.sh rename to recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/run.sh index fcc5f472d8c..cdefd5a4b57 
100644 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tfq16.sh +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/run.sh @@ -80,7 +80,7 @@ n_resp_per_prompt=16 train_prompt_mini_bsz=32 total_rollout_steps=$(((512*400))) test_freq=20 -staleness_threshold=1 +staleness_threshold=0.1 trigger_parameter_sync_step=16 partial_rollout=True diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16_stal0.1/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tfq16.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16_stal0.1/run.sh similarity index 100% rename from recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16_stal0.1/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tfq16.sh rename to recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16_stal0.1/run.sh diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-8_mbs32_tfq32.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/run.sh similarity index 99% rename from recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-8_mbs32_tfq32.sh rename to recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/run.sh index 6c6cb13cf45..3de9279a9bc 100644 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-8_mbs32_tfq32.sh +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/run.sh @@ -80,7 +80,7 @@ n_resp_per_prompt=16 train_prompt_mini_bsz=32 total_rollout_steps=$(((512*400))) test_freq=20 -staleness_threshold=1 +staleness_threshold=0.1 trigger_parameter_sync_step=32 partial_rollout=True diff --git 
a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tfq11.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/run.sh similarity index 99% rename from recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tfq11.sh rename to recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/run.sh index 9add4e0e8bb..4ba49146329 100644 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tfq11.sh +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/run.sh @@ -80,7 +80,7 @@ n_resp_per_prompt=16 train_prompt_mini_bsz=32 total_rollout_steps=$(((512*400))) test_freq=20 -staleness_threshold=1 +staleness_threshold=0.1 trigger_parameter_sync_step=11 partial_rollout=True diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/dapo_7b_math_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/run.sh similarity index 100% rename from recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/dapo_7b_math_megatron_colocate.sh rename to recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/run.sh diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_colocate/run.sh similarity index 100% rename from recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_colocate/dapo_7b_math_fsdp2_colocate.sh rename to recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_colocate/run.sh diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-40_mbs32_tfq6.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/run.sh similarity index 99% rename from 
recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-40_mbs32_tfq6.sh rename to recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/run.sh index 5da2116ef80..3d56ea8b403 100644 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-40_mbs32_tfq6.sh +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/run.sh @@ -80,7 +80,7 @@ n_resp_per_prompt=16 train_prompt_mini_bsz=32 total_rollout_steps=$(((512*400))) test_freq=20 -staleness_threshold=1 +staleness_threshold=0.1 trigger_parameter_sync_step=6 partial_rollout=True diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/dapo_qwen2-7B-math_28k_fsdp2_fully-async_32-32_mbs32_tfq8.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/run.sh similarity index 99% rename from recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/dapo_qwen2-7B-math_28k_fsdp2_fully-async_32-32_mbs32_tfq8.sh rename to recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/run.sh index 221d3c4d5a6..cc26be4f100 100644 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/dapo_qwen2-7B-math_28k_fsdp2_fully-async_32-32_mbs32_tfq8.sh +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/run.sh @@ -80,7 +80,7 @@ n_resp_per_prompt=16 train_prompt_mini_bsz=32 total_rollout_steps=$(((512*400))) test_freq=20 -staleness_threshold=1 +staleness_threshold=0.1 trigger_parameter_sync_step=8 partial_rollout=True diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_40-24_mbs32_tfq11.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/run.sh similarity index 99% rename from 
recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_40-24_mbs32_tfq11.sh rename to recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/run.sh index a15cf990bd1..0a67a563819 100644 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/dapo_qwen2-7B-math_28k_fsdp2_fully-async_40-24_mbs32_tfq11.sh +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/run.sh @@ -80,7 +80,7 @@ n_resp_per_prompt=16 train_prompt_mini_bsz=32 total_rollout_steps=$(((512*400))) test_freq=20 -staleness_threshold=1 +staleness_threshold=0.1 trigger_parameter_sync_step=11 partial_rollout=True diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/dapo_7b_math_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/run.sh similarity index 100% rename from recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/dapo_7b_math_megatron_colocate.sh rename to recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/run.sh diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_128/fsdp2_colocate/test_dapo_qwen3_30b_math.sh b/recipe/fully_async_policy/exp/qwen3-30BA3B_128/fsdp2_colocate/run.sh similarity index 100% rename from recipe/fully_async_policy/exp/qwen3-30BA3B_128/fsdp2_colocate/test_dapo_qwen3_30b_math.sh rename to recipe/fully_async_policy/exp/qwen3-30BA3B_128/fsdp2_colocate/run.sh diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/early_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/run.sh similarity index 100% rename from recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/early_megatron_colocate.sh rename to recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/run.sh diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_32/megatron_colocate/early_megatron_colocate.sh 
b/recipe/fully_async_policy/exp/qwen3-30BA3B_32/megatron_colocate/early_megatron_colocate.sh deleted file mode 100644 index b2d735f8704..00000000000 --- a/recipe/fully_async_policy/exp/qwen3-30BA3B_32/megatron_colocate/early_megatron_colocate.sh +++ /dev/null @@ -1,161 +0,0 @@ -#!/usr/bin/env bash -set -xeuo pipefail - -project_name='DAPO' -exp_name='dapo_qwen3-30BA3B_32k_megatron_colocate_32_mbs32' - -adv_estimator=grpo - -use_kl_in_reward=False -kl_coef=0.0 -use_kl_loss=False -kl_loss_coef=0.0 - -clip_ratio_low=0.2 -clip_ratio_high=0.28 - -max_prompt_length=$((1024 * 2)) -max_response_length=$((1024 * 32)) -enable_overlong_buffer=True -overlong_buffer_len=$((1024 * 4)) -overlong_penalty_factor=1.0 - -loss_agg_mode="token-mean" - -train_prompt_bsz=512 -train_prompt_mini_bsz=32 -n_resp_per_prompt=16 - -NNODES=${NNODES:-4} -NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} -# Paths -RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} -MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/basemodel/Qwen/Qwen3-30B-A3B -CKPTS_DIR=./ckpts/${project_name}/${exp_name} -TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet -TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet - -# Algorithm -temperature=1.0 -top_p=1.0 -top_k=-1 # 0 for HF rollout, -1 for vLLM rollout -val_top_p=0.7 - - -# Performance Related Parameter -use_dynamic_bsz=True -actor_ppo_max_token_len=$((max_prompt_length + max_response_length)) -infer_ppo_max_token_len=$((max_prompt_length + max_response_length)) -offload=True -gen_tp=4 -train_tp=1 -train_pp=1 -EP=8 -ETP=1 -CP=1 - -python3 -m verl.trainer.main_ppo \ - --config-path=config \ - --config-name='ppo_megatron_trainer.yaml' \ - data.train_files="${TRAIN_FILE}" \ - data.val_files="${TEST_FILE}" \ - data.prompt_key=prompt \ - data.truncation='left' \ - data.max_prompt_length=${max_prompt_length} \ - 
data.max_response_length=${max_response_length} \ - data.train_batch_size=${train_prompt_bsz} \ - actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ - algorithm.adv_estimator=${adv_estimator} \ - algorithm.use_kl_in_reward=${use_kl_in_reward} \ - algorithm.kl_ctrl.kl_coef=${kl_coef} \ - actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ - actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ - actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ - actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ - actor_rollout_ref.actor.clip_ratio_c=10.0 \ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \ - actor_rollout_ref.model.path="${MODEL_PATH}" \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ - actor_rollout_ref.actor.optim.weight_decay=0.1 \ - actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ - actor_rollout_ref.actor.entropy_coeff=0 \ - actor_rollout_ref.actor.optim.clip_grad=1.0 \ - actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ - actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ - actor_rollout_ref.rollout.enable_chunked_prefill=True \ - actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ - actor_rollout_ref.rollout.temperature=${temperature} \ - actor_rollout_ref.rollout.top_p=${top_p} \ - actor_rollout_ref.rollout.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ - actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ - actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.do_sample=True \ - actor_rollout_ref.rollout.val_kwargs.n=1 \ - reward_model.reward_manager=dapo \ - 
+reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ - +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ - trainer.logger='["console","tensorboard"]' \ - trainer.project_name="${project_name}" \ - trainer.experiment_name="${exp_name}" \ - trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - trainer.nnodes="${NNODES}" \ - trainer.val_before_train=True \ - trainer.test_freq=20 \ - trainer.save_freq=-1 \ - trainer.total_epochs=10 \ - trainer.total_training_steps=400 \ - trainer.default_local_dir="${CKPTS_DIR}" \ - trainer.resume_mode=auto \ - trainer.log_val_generations=10 \ - critic.strategy=megatron \ - actor_rollout_ref.actor.strategy=megatron \ - actor_rollout_ref.actor.megatron.param_offload=${offload} \ - actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \ - actor_rollout_ref.actor.megatron.grad_offload=${offload} \ - actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \ - actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \ - actor_rollout_ref.actor.megatron.expert_model_parallel_size=${EP} \ - actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${ETP} \ - actor_rollout_ref.actor.megatron.context_parallel_size=${CP} \ - +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=True \ - +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \ - +actor_rollout_ref.actor.megatron.override_transformer_config.moe_shared_expert_overlap=False \ - +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=True \ - +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=flex \ - 
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \ - +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=selective \ - +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","moe_act","layernorm","mlp","moe"] \ - +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \ - +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \ - +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \ - actor_rollout_ref.ref.megatron.param_offload=${offload} \ - actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \ - actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \ - actor_rollout_ref.ref.megatron.expert_model_parallel_size=${EP} \ - actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${ETP} \ - actor_rollout_ref.ref.megatron.context_parallel_size=${CP} \ - actor_rollout_ref.actor.megatron.use_mbridge=True - - # +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=False \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_shared_expert_overlap=False \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=False \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=alltoall \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \ - # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity="selective" \ - # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","moe_act","layernorm","mlp","moe"] 
\ - # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage=${last_layer} \ \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_32/megatron_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen3-30BA3B_32/megatron_colocate/runtime_env.yaml deleted file mode 100644 index 052557120ad..00000000000 --- a/recipe/fully_async_policy/exp/qwen3-30BA3B_32/megatron_colocate/runtime_env.yaml +++ /dev/null @@ -1,5 +0,0 @@ -env_vars: - TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen3-30BA3B-32/dapo_qwen3-30BA3B_32k_megatron_colocate_32_mbs32" - HYDRA_FULL_ERROR: "1" - TORCH_NCCL_AVOID_RECORD_STREAMS: "1" - CUDA_DEVICE_MAX_CONNECTIONS: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/early_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/early_megatron_colocate.sh deleted file mode 100644 index 336d105cc5c..00000000000 --- a/recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/early_megatron_colocate.sh +++ /dev/null @@ -1,161 +0,0 @@ -#!/usr/bin/env bash -set -xeuo pipefail - -project_name='DAPO' -exp_name='dapo_qwen3-30BA3B_32k_megatron_colocate_64_mbs32' - -adv_estimator=grpo - -use_kl_in_reward=False -kl_coef=0.0 -use_kl_loss=False -kl_loss_coef=0.0 - -clip_ratio_low=0.2 -clip_ratio_high=0.28 - -max_prompt_length=$((1024 * 2)) 
-max_response_length=$((1024 * 32)) -enable_overlong_buffer=True -overlong_buffer_len=$((1024 * 4)) -overlong_penalty_factor=1.0 - -loss_agg_mode="token-mean" - -train_prompt_bsz=512 -train_prompt_mini_bsz=32 -n_resp_per_prompt=16 - -NNODES=${NNODES:-8} -NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} -# Paths -RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} -MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/basemodel/Qwen/Qwen3-30B-A3B -CKPTS_DIR=./ckpts/${project_name}/${exp_name} -TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet -TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet - -# Algorithm -temperature=1.0 -top_p=1.0 -top_k=-1 # 0 for HF rollout, -1 for vLLM rollout -val_top_p=0.7 - - -# Performance Related Parameter -use_dynamic_bsz=True -actor_ppo_max_token_len=$((max_prompt_length + max_response_length)) -infer_ppo_max_token_len=$((max_prompt_length + max_response_length)) -offload=True -gen_tp=4 -train_tp=1 -train_pp=1 -EP=8 -ETP=1 -CP=1 - -python3 -m verl.trainer.main_ppo \ - --config-path=config \ - --config-name='ppo_megatron_trainer.yaml' \ - data.train_files="${TRAIN_FILE}" \ - data.val_files="${TEST_FILE}" \ - data.prompt_key=prompt \ - data.truncation='left' \ - data.max_prompt_length=${max_prompt_length} \ - data.max_response_length=${max_response_length} \ - data.train_batch_size=${train_prompt_bsz} \ - actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ - algorithm.adv_estimator=${adv_estimator} \ - algorithm.use_kl_in_reward=${use_kl_in_reward} \ - algorithm.kl_ctrl.kl_coef=${kl_coef} \ - actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ - actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ - actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ - actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ - actor_rollout_ref.actor.clip_ratio_c=10.0 \ - 
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \ - actor_rollout_ref.model.path="${MODEL_PATH}" \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ - actor_rollout_ref.actor.optim.weight_decay=0.1 \ - actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ - actor_rollout_ref.actor.entropy_coeff=0 \ - actor_rollout_ref.actor.optim.clip_grad=1.0 \ - actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ - actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ - actor_rollout_ref.rollout.enable_chunked_prefill=True \ - actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ - actor_rollout_ref.rollout.temperature=${temperature} \ - actor_rollout_ref.rollout.top_p=${top_p} \ - actor_rollout_ref.rollout.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ - actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ - actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.do_sample=True \ - actor_rollout_ref.rollout.val_kwargs.n=1 \ - reward_model.reward_manager=dapo \ - +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ - +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ - trainer.logger='["console","tensorboard"]' \ - trainer.project_name="${project_name}" \ - trainer.experiment_name="${exp_name}" \ - trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - trainer.nnodes="${NNODES}" \ - trainer.val_before_train=True \ - 
trainer.test_freq=20 \ - trainer.save_freq=-1 \ - trainer.total_epochs=10 \ - trainer.total_training_steps=400 \ - trainer.default_local_dir="${CKPTS_DIR}" \ - trainer.resume_mode=auto \ - trainer.log_val_generations=10 \ - critic.strategy=megatron \ - actor_rollout_ref.actor.strategy=megatron \ - actor_rollout_ref.actor.megatron.param_offload=${offload} \ - actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \ - actor_rollout_ref.actor.megatron.grad_offload=${offload} \ - actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \ - actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \ - actor_rollout_ref.actor.megatron.expert_model_parallel_size=${EP} \ - actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${ETP} \ - actor_rollout_ref.actor.megatron.context_parallel_size=${CP} \ - +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=True \ - +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \ - +actor_rollout_ref.actor.megatron.override_transformer_config.moe_shared_expert_overlap=False \ - +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=True \ - +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=flex \ - +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \ - +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=selective \ - +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","moe_act","layernorm","mlp","moe"] \ - +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \ - +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \ - +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \ - 
actor_rollout_ref.ref.megatron.param_offload=${offload} \ - actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \ - actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \ - actor_rollout_ref.ref.megatron.expert_model_parallel_size=${EP} \ - actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${ETP} \ - actor_rollout_ref.ref.megatron.context_parallel_size=${CP} \ - actor_rollout_ref.actor.megatron.use_mbridge=True - - # +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=False \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_shared_expert_overlap=False \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=False \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=alltoall \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \ - # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity="selective" \ - # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","moe_act","layernorm","mlp","moe"] \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage=${last_layer} \ \ No newline 
at end of file diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/runtime_env.yaml deleted file mode 100644 index 3a497e90dd0..00000000000 --- a/recipe/fully_async_policy/exp/qwen3-30BA3B_64/megatron_colocate/runtime_env.yaml +++ /dev/null @@ -1,5 +0,0 @@ -env_vars: - TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen3-30BA3B-64/dapo_qwen3-30BA3B_32k_megatron_colocate_64_mbs32" - HYDRA_FULL_ERROR: "1" - TORCH_NCCL_AVOID_RECORD_STREAMS: "1" - CUDA_DEVICE_MAX_CONNECTIONS: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen3-32B_128/fsdp2_colocate/fsdp2.sh b/recipe/fully_async_policy/exp/qwen3-32B_128/fsdp2_colocate/run.sh similarity index 100% rename from recipe/fully_async_policy/exp/qwen3-32B_128/fsdp2_colocate/fsdp2.sh rename to recipe/fully_async_policy/exp/qwen3-32B_128/fsdp2_colocate/run.sh diff --git a/recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/early_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/run.sh similarity index 100% rename from recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/early_megatron_colocate.sh rename to recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/run.sh diff --git a/recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/early_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/early_megatron_colocate.sh deleted file mode 100644 index 085c7231c59..00000000000 --- a/recipe/fully_async_policy/exp/qwen3-32B_32/megatron_colocate/early_megatron_colocate.sh +++ /dev/null @@ -1,156 +0,0 @@ -#!/usr/bin/env bash -set -xeuo pipefail - -project_name='DAPO' -exp_name='dapo_qwen3-32B_32k_megatron_colocate_32_mbs32' - -adv_estimator=grpo - -use_kl_in_reward=False -kl_coef=0.0 -use_kl_loss=False -kl_loss_coef=0.0 - -clip_ratio_low=0.2 
-clip_ratio_high=0.28 - -max_prompt_length=$((1024 * 2)) -max_response_length=$((1024 * 32)) -enable_overlong_buffer=True -overlong_buffer_len=$((1024 * 4)) -overlong_penalty_factor=1.0 - -loss_agg_mode="token-mean" - -train_prompt_bsz=512 -train_prompt_mini_bsz=32 -n_resp_per_prompt=16 - -NNODES=${NNODES:-4} -NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} -# Paths -RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} -MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/basemodel/Qwen/Qwen3-32B -CKPTS_DIR=./ckpts/${project_name}/${exp_name} -TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet -TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet - -# Algorithm -temperature=1.0 -top_p=1.0 -top_k=-1 # 0 for HF rollout, -1 for vLLM rollout -val_top_p=0.7 - - -# Performance Related Parameter -use_dynamic_bsz=True -actor_ppo_max_token_len=$((max_prompt_length + max_response_length)) -infer_ppo_max_token_len=$((max_prompt_length + max_response_length)) -offload=True -gen_tp=4 -train_tp=4 -train_pp=2 -EP=1 -ETP=1 -CP=1 - -python3 -m verl.trainer.main_ppo \ - --config-path=config \ - --config-name='ppo_megatron_trainer.yaml' \ - data.train_files="${TRAIN_FILE}" \ - data.val_files="${TEST_FILE}" \ - data.prompt_key=prompt \ - data.truncation='left' \ - data.max_prompt_length=${max_prompt_length} \ - data.max_response_length=${max_response_length} \ - data.train_batch_size=${train_prompt_bsz} \ - actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ - algorithm.adv_estimator=${adv_estimator} \ - algorithm.use_kl_in_reward=${use_kl_in_reward} \ - algorithm.kl_ctrl.kl_coef=${kl_coef} \ - actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ - actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ - actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ - actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ - 
actor_rollout_ref.actor.clip_ratio_c=10.0 \ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \ - actor_rollout_ref.model.path="${MODEL_PATH}" \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ - actor_rollout_ref.actor.optim.weight_decay=0.1 \ - actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ - actor_rollout_ref.actor.entropy_coeff=0 \ - actor_rollout_ref.actor.optim.clip_grad=1.0 \ - actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ - actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ - actor_rollout_ref.rollout.enable_chunked_prefill=True \ - actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ - actor_rollout_ref.rollout.temperature=${temperature} \ - actor_rollout_ref.rollout.top_p=${top_p} \ - actor_rollout_ref.rollout.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ - actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ - actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.do_sample=True \ - actor_rollout_ref.rollout.val_kwargs.n=1 \ - reward_model.reward_manager=dapo \ - +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ - +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ - trainer.logger='["console","tensorboard"]' \ - trainer.project_name="${project_name}" \ - trainer.experiment_name="${exp_name}" \ - trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - trainer.nnodes="${NNODES}" \ 
- trainer.val_before_train=True \ - trainer.test_freq=20 \ - trainer.save_freq=-1 \ - trainer.total_epochs=10 \ - trainer.total_training_steps=400 \ - trainer.default_local_dir="${CKPTS_DIR}" \ - trainer.resume_mode=auto \ - trainer.log_val_generations=10 \ - critic.strategy=megatron \ - actor_rollout_ref.actor.strategy=megatron \ - actor_rollout_ref.actor.megatron.param_offload=${offload} \ - actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \ - actor_rollout_ref.actor.megatron.grad_offload=${offload} \ - actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \ - actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \ - actor_rollout_ref.actor.megatron.expert_model_parallel_size=${EP} \ - actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${ETP} \ - actor_rollout_ref.actor.megatron.context_parallel_size=${CP} \ - +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=True \ - +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=selective \ - +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","layernorm","mlp"] \ - +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \ - +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \ - +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \ - actor_rollout_ref.ref.megatron.param_offload=${offload} \ - actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \ - actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \ - actor_rollout_ref.ref.megatron.expert_model_parallel_size=${EP} \ - actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${ETP} \ - actor_rollout_ref.ref.megatron.context_parallel_size=${CP} \ - actor_rollout_ref.actor.megatron.use_mbridge=True - - # 
+actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=False \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_shared_expert_overlap=False \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=False \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=alltoall \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \ - # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity="selective" \ - # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","moe_act","layernorm","mlp","moe"] \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage=${last_layer} \ \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen3-32B_64/megatron_colocate/early_megatron_colocate.sh b/recipe/fully_async_policy/exp/qwen3-32B_64/megatron_colocate/early_megatron_colocate.sh deleted file mode 100644 index 145ea3dbec9..00000000000 --- a/recipe/fully_async_policy/exp/qwen3-32B_64/megatron_colocate/early_megatron_colocate.sh +++ /dev/null @@ -1,156 +0,0 @@ -#!/usr/bin/env bash -set -xeuo pipefail - -project_name='DAPO' 
-exp_name='dapo_qwen3-32B_32k_megatron_colocate_64_mbs32' - -adv_estimator=grpo - -use_kl_in_reward=False -kl_coef=0.0 -use_kl_loss=False -kl_loss_coef=0.0 - -clip_ratio_low=0.2 -clip_ratio_high=0.28 - -max_prompt_length=$((1024 * 2)) -max_response_length=$((1024 * 32)) -enable_overlong_buffer=True -overlong_buffer_len=$((1024 * 4)) -overlong_penalty_factor=1.0 - -loss_agg_mode="token-mean" - -train_prompt_bsz=512 -train_prompt_mini_bsz=32 -n_resp_per_prompt=16 - -NNODES=${NNODES:-8} -NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} -# Paths -RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} -MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/basemodel/Qwen/Qwen3-32B -CKPTS_DIR=./ckpts/${project_name}/${exp_name} -TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet -TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet - -# Algorithm -temperature=1.0 -top_p=1.0 -top_k=-1 # 0 for HF rollout, -1 for vLLM rollout -val_top_p=0.7 - - -# Performance Related Parameter -use_dynamic_bsz=True -actor_ppo_max_token_len=$((max_prompt_length + max_response_length)) -infer_ppo_max_token_len=$((max_prompt_length + max_response_length)) -offload=True -gen_tp=4 -train_tp=4 -train_pp=2 -EP=1 -ETP=1 -CP=1 - -python3 -m verl.trainer.main_ppo \ - --config-path=config \ - --config-name='ppo_megatron_trainer.yaml' \ - data.train_files="${TRAIN_FILE}" \ - data.val_files="${TEST_FILE}" \ - data.prompt_key=prompt \ - data.truncation='left' \ - data.max_prompt_length=${max_prompt_length} \ - data.max_response_length=${max_response_length} \ - data.train_batch_size=${train_prompt_bsz} \ - actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ - algorithm.adv_estimator=${adv_estimator} \ - algorithm.use_kl_in_reward=${use_kl_in_reward} \ - algorithm.kl_ctrl.kl_coef=${kl_coef} \ - actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ - 
actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ - actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ - actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ - actor_rollout_ref.actor.clip_ratio_c=10.0 \ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \ - actor_rollout_ref.model.path="${MODEL_PATH}" \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ - actor_rollout_ref.actor.optim.weight_decay=0.1 \ - actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ - actor_rollout_ref.actor.entropy_coeff=0 \ - actor_rollout_ref.actor.optim.clip_grad=1.0 \ - actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ - actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ - actor_rollout_ref.rollout.enable_chunked_prefill=True \ - actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ - actor_rollout_ref.rollout.temperature=${temperature} \ - actor_rollout_ref.rollout.top_p=${top_p} \ - actor_rollout_ref.rollout.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ - actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ - actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.do_sample=True \ - actor_rollout_ref.rollout.val_kwargs.n=1 \ - reward_model.reward_manager=dapo \ - +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ - +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ - 
trainer.logger='["console","tensorboard"]' \ - trainer.project_name="${project_name}" \ - trainer.experiment_name="${exp_name}" \ - trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - trainer.nnodes="${NNODES}" \ - trainer.val_before_train=True \ - trainer.test_freq=20 \ - trainer.save_freq=-1 \ - trainer.total_epochs=10 \ - trainer.total_training_steps=400 \ - trainer.default_local_dir="${CKPTS_DIR}" \ - trainer.resume_mode=auto \ - trainer.log_val_generations=10 \ - critic.strategy=megatron \ - actor_rollout_ref.actor.strategy=megatron \ - actor_rollout_ref.actor.megatron.param_offload=${offload} \ - actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \ - actor_rollout_ref.actor.megatron.grad_offload=${offload} \ - actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \ - actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \ - actor_rollout_ref.actor.megatron.expert_model_parallel_size=${EP} \ - actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${ETP} \ - actor_rollout_ref.actor.megatron.context_parallel_size=${CP} \ - +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=True \ - +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=selective \ - +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","layernorm","mlp"] \ - +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \ - +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \ - +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \ - actor_rollout_ref.ref.megatron.param_offload=${offload} \ - actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \ - actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \ - actor_rollout_ref.ref.megatron.expert_model_parallel_size=${EP} \ - 
actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${ETP} \ - actor_rollout_ref.ref.megatron.context_parallel_size=${CP} \ - actor_rollout_ref.actor.megatron.use_mbridge=True - - # +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=False \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_shared_expert_overlap=False \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=False \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=alltoall \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \ - # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity="selective" \ - # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","moe_act","layernorm","mlp","moe"] \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage=${last_layer} \ \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen3-32B_64/megatron_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen3-32B_64/megatron_colocate/runtime_env.yaml deleted file mode 100644 index d3dc7176f0a..00000000000 --- 
a/recipe/fully_async_policy/exp/qwen3-32B_64/megatron_colocate/runtime_env.yaml +++ /dev/null @@ -1,5 +0,0 @@ -env_vars: - TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen3-32B-64/dapo_qwen3-32B_32k_megatron_colocate_64_mbs32" - HYDRA_FULL_ERROR: "1" - TORCH_NCCL_AVOID_RECORD_STREAMS: "1" - CUDA_DEVICE_MAX_CONNECTIONS: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_2_6.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_2_6.sh index 0d303bdde87..10563218878 100644 --- a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_2_6.sh +++ b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_2_6.sh @@ -78,7 +78,7 @@ n_resp_per_prompt=16 train_prompt_mini_bsz=64 total_rollout_steps=$(((512*100))) test_freq=2 -staleness_threshold=1 +staleness_threshold=0.1 trigger_parameter_sync_step=16 partial_rollout=True diff --git a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh index 2b4bf9c31fe..fc9b2ad6607 100644 --- a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh +++ b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh @@ -78,7 +78,7 @@ n_resp_per_prompt=16 train_prompt_mini_bsz=64 total_rollout_steps=$(((512*100))) test_freq=10 -staleness_threshold=1 +staleness_threshold=0.1 trigger_parameter_sync_step=16 partial_rollout=True diff --git a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh index 688a87fab92..c59877d97f9 100644 --- a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh +++ b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh @@ -78,7 +78,7 @@ n_resp_per_prompt=16 train_prompt_mini_bsz=64 total_rollout_steps=$(((512*100))) test_freq=10 -staleness_threshold=1 +staleness_threshold=0.1 trigger_parameter_sync_step=16 partial_rollout=True diff --git a/tests/special_e2e/run_fully_async_policy.sh 
b/tests/special_e2e/run_fully_async_policy.sh index 63cfcf622a6..4813f159696 100644 --- a/tests/special_e2e/run_fully_async_policy.sh +++ b/tests/special_e2e/run_fully_async_policy.sh @@ -58,7 +58,7 @@ n_resp_per_prompt=16 train_prompt_mini_bsz=32 total_rollout_steps=$(((128*2))) test_freq=10 -staleness_threshold=1 +staleness_threshold=0.1 trigger_parameter_sync_step=16 partial_rollout=True From d471890b404dd4ee470ccb8da6ab1fd047422b3e Mon Sep 17 00:00:00 2001 From: wangshulin02 <953550366@qq.com> Date: Wed, 10 Sep 2025 22:04:16 +0800 Subject: [PATCH 124/182] fix last_valid bug, fix staleness_samples reset --- recipe/fully_async_policy/fully_async_rollouter.py | 12 +++++++++--- recipe/fully_async_policy/fully_async_trainer.py | 4 +++- recipe/fully_async_policy/message_queue.py | 5 ++++- recipe/fully_async_policy/param_sync.py | 12 +++++++----- 4 files changed, 23 insertions(+), 10 deletions(-) diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 4a2a7d7200c..162836a00f6 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -197,10 +197,16 @@ async def update_param_version(self, version: int, validate: bool = False, globa old_version = self.current_param_version self.current_param_version = version # every time param change, reset staleness_samples - self.staleness_samples = 0 + self.staleness_samples = ( + len(self.active_tasks) + + self.result_queue.qsize() + + self.cancel_queue.qsize() + + await self.message_queue_client.get_queue_size() + ) print( f"[FullyAsyncRollouter][Public][update_param_version] " - f"Parameter version updated from {old_version} to {version}" + f"Parameter version updated from {old_version} to {version} " + f",reset staleness_samples to: {self.staleness_samples}" ) timing_raw = {} if ( @@ -412,7 +418,7 @@ async def _consumer_worker(self): rollout_sample = await self.result_queue.get() rollout_sample = 
merge_rollout_sample(self.config, self.tokenizer, rollout_sample) - # 直接将 RolloutSample 放入消息队列 + # 将 RolloutSample 放入消息队列 success = await self.message_queue_client.put_sample( sample=ray.cloudpickle.dumps(rollout_sample), param_version=rollout_sample.param_version, diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 0d83a00ba4a..5d2a2c794e8 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -337,7 +337,9 @@ def fit(self): val_data: ValidateMetrics = ray.cloudpickle.loads(val_data) self.logger.log(data=val_data.metrics, step=val_data.param_version) self.logger.log(data=val_data.timing_raw, step=val_data.param_version) - pprint(f"[FullyAsyncTrainer] Final validation metrics: {val_data.metrics}") + pprint(f"[FullyAsyncTrainer] Final validation metrics: {val_data.metrics}") + else: + pprint(f"[FullyAsyncTrainer] Final validation metrics: {val_data.metrics}") self.progress_bar.close() self._check_save_checkpoint(True, timing_raw) # TODO: 检查checkpoint diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py index 22573f4b9d5..5094f9ab90a 100644 --- a/recipe/fully_async_policy/message_queue.py +++ b/recipe/fully_async_policy/message_queue.py @@ -79,9 +79,11 @@ async def put_sample(self, sample: Any, param_version: int) -> bool: """ async with self._lock: # If queue is full, remove the oldest sample (rarely happens) + is_drop = False if len(self.queue) >= self.max_queue_size: self.queue.popleft() self.dropped_samples += 1 + is_drop = True logger.warning("Queue full, dropped sample") self.queue.append(sample) self.total_produced += 1 @@ -91,7 +93,8 @@ async def put_sample(self, sample: Any, param_version: int) -> bool: if self.total_produced % 100 == 0: print(f"MessageQueue stats: produced={self.total_produced}, queue_size={len(self.queue)}") - + if is_drop: + return False return True async def 
get_sample(self) -> Any | None: diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py index 35efdd9c950..2e11327afec 100644 --- a/recipe/fully_async_policy/param_sync.py +++ b/recipe/fully_async_policy/param_sync.py @@ -41,7 +41,8 @@ def __init__(self, config, trainer, rollouter, mq): self.weights_info = None self.sync_group_initialized = False self.sync_group_name = "actor_rollout" - self.wait_last = None + self.wait_last0 = None + self.wait_last1 = None # Statistics self.current_version = 0 @@ -90,12 +91,13 @@ def sync_weights(self, version, validate=False, global_steps=0): print(f"[ParameterSynchronizer] sync_weights success. cost {end_time - start_time:.2f} seconds") # Async Update rollout version & validation - self.rollouter.update_param_version.remote(version, validate, global_steps) - self.wait_last = self.rollouter.resume.remote() + self.wait_last0 = self.rollouter.update_param_version.remote(version, validate, global_steps) + self.wait_last1 = self.rollouter.resume.remote() def wait_last_sync(self): print("[ParameterSynchronizer] waiting last parameter sync and validate...") start_time = time.time() - if self.wait_last: - ray.get(self.wait_last) + if self.wait_last0 or self.wait_last1 : + ray.get(self.wait_last0) + ray.get(self.wait_last1) print(f"[ParameterSynchronizer], cost: {time.time() - start_time:.2f} seconds") From 840cc73ae87418601f3ab54d91056f4fd7008287 Mon Sep 17 00:00:00 2001 From: wangshulin02 <953550366@qq.com> Date: Wed, 10 Sep 2025 22:07:23 +0800 Subject: [PATCH 125/182] fix wait_last --- recipe/fully_async_policy/param_sync.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py index 2e11327afec..89deecad962 100644 --- a/recipe/fully_async_policy/param_sync.py +++ b/recipe/fully_async_policy/param_sync.py @@ -97,7 +97,8 @@ def sync_weights(self, version, validate=False, global_steps=0): def 
wait_last_sync(self): print("[ParameterSynchronizer] waiting last parameter sync and validate...") start_time = time.time() - if self.wait_last0 or self.wait_last1 : + if self.wait_last0: ray.get(self.wait_last0) + if self.wait_last1: ray.get(self.wait_last1) print(f"[ParameterSynchronizer], cost: {time.time() - start_time:.2f} seconds") From fd48a9a50cc53bf987415b6f25ae3eb9eef22e51 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Thu, 11 Sep 2025 14:27:26 +0800 Subject: [PATCH 126/182] qwen2.5 32B --- .../fsdp2_fully-async_48-80/run.sh | 153 ++++++++++++++++++ .../fsdp2_fully-async_48-80/runtime_env.yaml | 4 + 2 files changed, 157 insertions(+) create mode 100644 recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_48-80/run.sh create mode 100644 recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_48-80/runtime_env.yaml diff --git a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_48-80/run.sh b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_48-80/run.sh new file mode 100644 index 00000000000..8ab8f9be2d9 --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_48-80/run.sh @@ -0,0 +1,153 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +project_name='DAPO' +exp_name='dapo_qwen2-32B_20k_fsdp2_fully-async_48-80' + +# Paths +MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/Qwen2.5-32B +CKPTS_DIR=./ckpts/${project_name}/${exp_name} +TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet +TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet + +rollout_mode="async" +rollout_name="vllm" # sglang or vllm +if [ "$rollout_mode" = "async" ]; then + export VLLM_USE_V1=1 + return_raw_chat="True" +fi + +# Algorithm parameters +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 
+clip_ratio_high=0.28 + +# Response length parameters +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 * 20)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 + +# Training parameters +loss_agg_mode="token-mean" + +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + +# Performance Related Parameter +use_dynamic_bsz=True +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) +ref_offload=True +actor_offload=False +gen_tp=4 +sp_size=8 +fsdp_size=-1 + +# Fully async specific parameters +NNODES_ROLLOUT=${NNODES_ROLLOUT:-6} +NNODES_TRAIN=${NNODES_TRAIN:-10} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} + +train_prompt_bsz=0 +gen_prompt_bsz=1 +n_resp_per_prompt=16 +train_prompt_mini_bsz=32 +total_rollout_steps=$(((512*400))) +test_freq=20 +staleness_threshold=0.1 +trigger_parameter_sync_step=3 +partial_rollout=True + +python -m recipe.fully_async_policy.fully_async_main \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + data.gen_batch_size=${gen_prompt_bsz} \ + data.return_raw_chat=${return_raw_chat} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.strategy=fsdp2 \ + critic.strategy=fsdp2 \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + 
actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.hybrid_engine=False \ + +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ + actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.grad_clip=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + 
actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \ + actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ + actor_rollout_ref.rollout.name=${rollout_name} \ + actor_rollout_ref.rollout.mode=${rollout_mode} \ + actor_rollout_ref.rollout.calculate_log_probs=True \ + reward_model.reward_manager=dapo \ + +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger=['console','tensorboard'] \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.val_before_train=True \ + trainer.save_freq=-1 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.resume_mode=auto \ + trainer.nnodes="${NNODES_TRAIN}" \ + trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ + rollout.nnodes="${NNODES_ROLLOUT}" \ + rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ + rollout.total_rollout_steps="${total_rollout_steps}" \ + rollout.total_epochs=10 \ + rollout.test_freq="${test_freq}" \ + async_training.staleness_threshold="${staleness_threshold}" \ + async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ + async_training.partial_rollout="${partial_rollout}" diff --git a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_48-80/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_48-80/runtime_env.yaml new file mode 100644 index 00000000000..0caf9804ebc --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_48-80/runtime_env.yaml @@ -0,0 +1,4 @@ +env_vars: + VLLM_USE_V1: "1" + TENSORBOARD_DIR: 
"/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-32B-128/dapo_qwen2-32B_20k_fsdp2_fully-async_48-80" + HYDRA_FULL_ERROR: "1" \ No newline at end of file From 8f445b04b1e78424f75aa66c013c77de020d0c64 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Thu, 11 Sep 2025 14:58:25 +0800 Subject: [PATCH 127/182] fsdp2_fully-async_16-16 --- .../fsdp2_fully-async_16-16/run.sh | 14 -- .../fsdp2_fully-async_16-16_stal0.1/run.sh | 168 ------------------ .../runtime_env.yaml | 4 - 3 files changed, 186 deletions(-) delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16_stal0.1/run.sh delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16_stal0.1/runtime_env.yaml diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/run.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/run.sh index cdefd5a4b57..2a22fd97d08 100644 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/run.sh +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/run.sh @@ -5,22 +5,8 @@ project_name='DAPO' exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tpf16_fixmcs' # Ray -# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} -# WORKING_DIR=${WORKING_DIR:-"${PWD}"} -# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} -# Paths -RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} -# very important! 
please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface -MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"} -CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} -TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} -TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} - -# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B CKPTS_DIR=./ckpts/${project_name}/${exp_name} -# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet -# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16_stal0.1/run.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16_stal0.1/run.sh deleted file mode 100644 index 2217661dd33..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16_stal0.1/run.sh +++ /dev/null @@ -1,168 +0,0 @@ -#!/usr/bin/env bash -set -xeuo pipefail - -project_name='DAPO' -exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tpf16_stal0.1' - -# Ray -# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} -# WORKING_DIR=${WORKING_DIR:-"${PWD}"} -# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} -# Paths -RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} -# very important! 
please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface -MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"} -CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} -TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} -TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} - -# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B -MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B -CKPTS_DIR=./ckpts/${project_name}/${exp_name} -# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet -# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet -TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet -TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet - -rollout_mode="async" -rollout_name="vllm" # sglang or vllm -if [ "$rollout_mode" = "async" ]; then - export VLLM_USE_V1=1 - return_raw_chat="True" -fi - -# Algorithm parameters -adv_estimator=grpo - -use_kl_in_reward=False -kl_coef=0.0 -use_kl_loss=False -kl_loss_coef=0.0 - -clip_ratio_low=0.2 -clip_ratio_high=0.28 - -# Response length parameters -max_prompt_length=$((1024 * 2)) -max_response_length=$((1024 * 28)) -enable_overlong_buffer=True -overlong_buffer_len=$((1024 * 4)) -overlong_penalty_factor=1.0 - -# Training parameters -loss_agg_mode="token-mean" - -# Algorithm -temperature=1.0 -top_p=1.0 -top_k=-1 # 0 for HF rollout, -1 for vLLM rollout -val_top_p=0.7 - -# Performance Related Parameter -use_dynamic_bsz=True -actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) -infer_ppo_max_token_len=$(((max_prompt_length + 
max_response_length) * 3)) -ref_offload=True -actor_offload=False -gen_tp=4 -sp_size=4 -fsdp_size=2 - -# Fully async specific parameters -NNODES_ROLLOUT=${NNODES_ROLLOUT:-2} -NNODES_TRAIN=${NNODES_TRAIN:-2} -NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} - - -train_prompt_bsz=0 -gen_prompt_bsz=1 -n_resp_per_prompt=16 -train_prompt_mini_bsz=32 -total_rollout_steps=$(((512*400))) -test_freq=20 -staleness_threshold=0.1 -trigger_parameter_sync_step=16 -partial_rollout=True - -python -m recipe.fully_async_policy.fully_async_main \ - data.train_files="${TRAIN_FILE}" \ - data.val_files="${TEST_FILE}" \ - data.prompt_key=prompt \ - data.truncation='left' \ - data.max_prompt_length=${max_prompt_length} \ - data.max_response_length=${max_response_length} \ - data.train_batch_size=${train_prompt_bsz} \ - data.gen_batch_size=${gen_prompt_bsz} \ - data.return_raw_chat=${return_raw_chat} \ - actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ - algorithm.adv_estimator=${adv_estimator} \ - algorithm.use_kl_in_reward=${use_kl_in_reward} \ - algorithm.kl_ctrl.kl_coef=${kl_coef} \ - actor_rollout_ref.actor.strategy=fsdp2 \ - critic.strategy=fsdp2 \ - actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ - actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ - actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ - actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ - actor_rollout_ref.actor.clip_ratio_c=10.0 \ - actor_rollout_ref.model.use_remove_padding=True \ - actor_rollout_ref.hybrid_engine=False \ - +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ - actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ - actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - 
actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.model.path="${MODEL_PATH}" \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ - actor_rollout_ref.actor.optim.weight_decay=0.1 \ - actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ - actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \ - actor_rollout_ref.actor.entropy_coeff=0 \ - actor_rollout_ref.actor.grad_clip=1.0 \ - actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ - actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ - actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ - actor_rollout_ref.rollout.enable_chunked_prefill=True \ - actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ - actor_rollout_ref.rollout.temperature=${temperature} \ - actor_rollout_ref.rollout.top_p=${top_p} \ - actor_rollout_ref.rollout.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ - actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ - actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.do_sample=True \ - actor_rollout_ref.rollout.val_kwargs.n=1 \ - actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \ - actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ - actor_rollout_ref.rollout.name=${rollout_name} \ - actor_rollout_ref.rollout.mode=${rollout_mode} \ - actor_rollout_ref.rollout.calculate_log_probs=True \ - reward_model.reward_manager=dapo \ - +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ - 
+reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ - +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ - trainer.logger=['console','tensorboard'] \ - trainer.project_name="${project_name}" \ - trainer.experiment_name="${exp_name}" \ - trainer.val_before_train=True \ - trainer.save_freq=-1 \ - trainer.default_local_dir="${CKPTS_DIR}" \ - trainer.resume_mode=auto \ - trainer.nnodes="${NNODES_TRAIN}" \ - trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - rollout.nnodes="${NNODES_ROLLOUT}" \ - rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ - rollout.total_rollout_steps="${total_rollout_steps}" \ - rollout.total_epochs=10 \ - rollout.test_freq="${test_freq}" \ - async_training.staleness_threshold="${staleness_threshold}" \ - async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ - async_training.partial_rollout="${partial_rollout}" diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16_stal0.1/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16_stal0.1/runtime_env.yaml deleted file mode 100644 index 0b188206127..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16_stal0.1/runtime_env.yaml +++ /dev/null @@ -1,4 +0,0 @@ -env_vars: - VLLM_USE_V1: "1" - TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tpf16_stal0.1" - HYDRA_FULL_ERROR: "1" \ No newline at end of file From f85c1381217f8b03c0bd6f49e7a1fd95dd757215 Mon Sep 17 00:00:00 2001 From: wangshulin02 <953550366@qq.com> Date: Fri, 12 Sep 2025 19:30:44 +0800 Subject: [PATCH 128/182] update note in config, param_sync, mq --- .../config/fully_async_ppo_trainer.yaml | 45 +++++++++++++------ recipe/fully_async_policy/message_queue.py | 6 +-- 
recipe/fully_async_policy/param_sync.py | 17 +++---- 3 files changed, 43 insertions(+), 25 deletions(-) diff --git a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml index c1f94b56b6b..0d6c05a9a5c 100644 --- a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml +++ b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml @@ -6,23 +6,42 @@ defaults: - ppo_trainer - _self_ -# ============= 完全异步训练配置 (Fully Async Training Config) ============= - async_training: - # 新鲜度控制 (Freshness Control) - staleness_threshold: 1 # 样本新鲜度阈值 - trigger_parameter_sync_step: 4 # >=1 train 每次训练一个batch, 迭代多少次后触发更新 - partial_rollout: True # 同步参数时,是否中断 rollout - use_rollout_log_probs: True -# Rollout配置 + # Maximum samples staleness threshold + staleness_threshold: 0.1 + + # Frequency of parameter synchronization between rollouter and trainer, + # One step means trainer obtains a batch of required samples + trigger_parameter_sync_step: 4 + + # When synchronizing parameters, whether to interrupt rollouter and perform partial rollout + partial_rollout: True + + # Whether to use rollout log probs for training + use_rollout_log_probs: True + +# Rollout config rollout: - nnodes: 1 # Number of nodes used in the rollout - n_gpus_per_node: 8 # Number of GPUs per node - n: 4 # 每个prompt生成的响应数量 + + # Number of nodes used in the rollout + nnodes: 1 + + # Number of GPUs per node + n_gpus_per_node: 8 + + # number of responses (i.e. num sample times). 
> 1 for grpo + n: 4 + + # Number of epochs in training total_rollout_steps: 100 + + # total_epochs: 10 - test_freq: 1 # 测试频率, 每多少次参数更新后进行一次测试 + + # Test frequency, how many times a parameter update triggers a validation + test_freq: 1 data: - gen_batch_size: 32 + # Number of samples generated, currently only support 1 + gen_batch_size: 1 diff --git a/recipe/fully_async_policy/message_queue.py b/recipe/fully_async_policy/message_queue.py index 5094f9ab90a..85860c6f2a0 100644 --- a/recipe/fully_async_policy/message_queue.py +++ b/recipe/fully_async_policy/message_queue.py @@ -27,12 +27,10 @@ class MessageQueue: """ Simplified Ray-based asynchronous message queue for communication between Rollouter and Trainer - 使用 asyncio 实现异步消息队列 """ def __init__(self, config: DictConfig, max_queue_size: int = 1000): self.config = config - # 确保 max_queue_size 不为 None if max_queue_size is None: raise ValueError(f"max_queue_size cannot be None, got: {max_queue_size}") self.max_queue_size = int(max_queue_size) @@ -52,7 +50,7 @@ def __init__(self, config: DictConfig, max_queue_size: int = 1000): # Asyncio for message handling self.running = True - # async safe - 在第一次使用时初始化 + # async safe self._lock = asyncio.Lock() self._consumer_condition = asyncio.Condition(self._lock) @@ -249,7 +247,7 @@ async def get_memory_usage(self) -> dict: future = self.queue_actor.get_memory_usage.remote() return await asyncio.wrap_future(future.future()) - # 为了兼容性,保留同步版本的方法(但标记为deprecated) + # Synchronous version of the method (deprecated) def put_sample_sync(self, sample: Any, param_version: int) -> bool: """Put batch into queue (sync - deprecated, use put_sample instead)""" return ray.get(self.queue_actor.put_sample.remote(sample, param_version)) diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py index 89deecad962..2a58292ff78 100644 --- a/recipe/fully_async_policy/param_sync.py +++ b/recipe/fully_async_policy/param_sync.py @@ -41,8 +41,8 @@ def __init__(self, 
config, trainer, rollouter, mq): self.weights_info = None self.sync_group_initialized = False self.sync_group_name = "actor_rollout" - self.wait_last0 = None - self.wait_last1 = None + self.wait_last_update = None + self.wait_last_resume = None # Statistics self.current_version = 0 @@ -74,6 +74,7 @@ def _init_sync_group(self): ) def sync_weights(self, version, validate=False, global_steps=0): + """Sync weights between trainer and rollouter, and update parameter version""" start_time = time.time() self.current_version = version @@ -91,14 +92,14 @@ def sync_weights(self, version, validate=False, global_steps=0): print(f"[ParameterSynchronizer] sync_weights success. cost {end_time - start_time:.2f} seconds") # Async Update rollout version & validation - self.wait_last0 = self.rollouter.update_param_version.remote(version, validate, global_steps) - self.wait_last1 = self.rollouter.resume.remote() + self.wait_last_update = self.rollouter.update_param_version.remote(version, validate, global_steps) + self.wait_last_resume = self.rollouter.resume.remote() def wait_last_sync(self): print("[ParameterSynchronizer] waiting last parameter sync and validate...") start_time = time.time() - if self.wait_last0: - ray.get(self.wait_last0) - if self.wait_last1: - ray.get(self.wait_last1) + if self.wait_last_update: + ray.get(self.wait_last_update) + if self.wait_last_resume: + ray.get(self.wait_last_resume) print(f"[ParameterSynchronizer], cost: {time.time() - start_time:.2f} seconds") From 56f853b38b794ce261e43ed6f5e4b1c201932b6d Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Sat, 13 Sep 2025 23:57:26 +0800 Subject: [PATCH 129/182] qwen2-32B --- .../exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh | 6 +++--- .../fsdp2_fully-async_64-64/runtime_env.yaml | 2 +- .../run.sh | 10 +++++----- .../runtime_env.yaml | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) rename recipe/fully_async_policy/exp/qwen2-32B_128/{fsdp2_fully-async_48-80 => fsdp2_fully-async_80-48}/run.sh (97%) 
rename recipe/fully_async_policy/exp/qwen2-32B_128/{fsdp2_fully-async_48-80 => fsdp2_fully-async_80-48}/runtime_env.yaml (81%) diff --git a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh index 270533a84c4..8427547d161 100644 --- a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh +++ b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh @@ -2,7 +2,7 @@ set -xeuo pipefail project_name='DAPO' -exp_name='dapo_qwen2-32B_20k_fsdp2_fully-async_64-64' +exp_name='dapo_qwen2-32B_20k_fsdp2_fully-async_64-64-tps1' # Paths MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/Qwen2.5-32B @@ -62,11 +62,11 @@ NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} train_prompt_bsz=0 gen_prompt_bsz=1 n_resp_per_prompt=16 -train_prompt_mini_bsz=32 +train_prompt_mini_bsz=128 total_rollout_steps=$(((512*400))) test_freq=20 staleness_threshold=0.1 -trigger_parameter_sync_step=4 +trigger_parameter_sync_step=1 partial_rollout=True python -m recipe.fully_async_policy.fully_async_main \ diff --git a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/runtime_env.yaml index 77590fb2709..ea506be787e 100644 --- a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/runtime_env.yaml +++ b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/runtime_env.yaml @@ -1,4 +1,4 @@ env_vars: VLLM_USE_V1: "1" - TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-32B-128/dapo_qwen2-32B_20k_fsdp2_fully-async_64-64" + TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-32B-128/dapo_qwen2-32B_20k_fsdp2_fully-async_64-64-tps1" HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git 
a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_48-80/run.sh b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_80-48/run.sh similarity index 97% rename from recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_48-80/run.sh rename to recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_80-48/run.sh index 8ab8f9be2d9..fd2874d0f98 100644 --- a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_48-80/run.sh +++ b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_80-48/run.sh @@ -2,7 +2,7 @@ set -xeuo pipefail project_name='DAPO' -exp_name='dapo_qwen2-32B_20k_fsdp2_fully-async_48-80' +exp_name='dapo_qwen2-32B_20k_fsdp2_fully-async_80-48-tps1' # Paths MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/Qwen2.5-32B @@ -55,18 +55,18 @@ sp_size=8 fsdp_size=-1 # Fully async specific parameters -NNODES_ROLLOUT=${NNODES_ROLLOUT:-6} -NNODES_TRAIN=${NNODES_TRAIN:-10} +NNODES_ROLLOUT=${NNODES_ROLLOUT:-10} +NNODES_TRAIN=${NNODES_TRAIN:-6} NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} train_prompt_bsz=0 gen_prompt_bsz=1 n_resp_per_prompt=16 -train_prompt_mini_bsz=32 +train_prompt_mini_bsz=128 total_rollout_steps=$(((512*400))) test_freq=20 staleness_threshold=0.1 -trigger_parameter_sync_step=3 +trigger_parameter_sync_step=1 partial_rollout=True python -m recipe.fully_async_policy.fully_async_main \ diff --git a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_48-80/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_80-48/runtime_env.yaml similarity index 81% rename from recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_48-80/runtime_env.yaml rename to recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_80-48/runtime_env.yaml index 0caf9804ebc..9997c4130f2 100644 --- a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_48-80/runtime_env.yaml +++ 
b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_80-48/runtime_env.yaml @@ -1,4 +1,4 @@ env_vars: VLLM_USE_V1: "1" - TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-32B-128/dapo_qwen2-32B_20k_fsdp2_fully-async_48-80" + TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-32B-128/dapo_qwen2-32B_20k_fsdp2_fully-async_80-48-tps1" HYDRA_FULL_ERROR: "1" \ No newline at end of file From 9f53cd71d571c2649300ab3602916488c2dc38b8 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Mon, 15 Sep 2025 14:06:20 +0800 Subject: [PATCH 130/182] update 32 workers --- .../exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/run.sh | 2 +- .../qwen2-7B-math_32/fsdp2_fully-async_16-16/runtime_env.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/run.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/run.sh index 2a22fd97d08..9fca6da9878 100644 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/run.sh +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/run.sh @@ -52,7 +52,7 @@ ref_offload=True actor_offload=False gen_tp=4 sp_size=4 -fsdp_size=2 +fsdp_size=8 # Fully async specific parameters NNODES_ROLLOUT=${NNODES_ROLLOUT:-2} diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/runtime_env.yaml index b3063ebc7f1..5f0292d2c0d 100644 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/runtime_env.yaml +++ b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/runtime_env.yaml @@ -1,4 +1,4 @@ env_vars: VLLM_USE_V1: "1" - TENSORBOARD_DIR: 
"/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tpf16_fixmcs" + TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tpf16-fsdpsize_8" HYDRA_FULL_ERROR: "1" \ No newline at end of file From d0a5142c7547e759829f5b0d1938d844f7a25600 Mon Sep 17 00:00:00 2001 From: wangshulin02 <953550366@qq.com> Date: Mon, 15 Sep 2025 15:10:28 +0800 Subject: [PATCH 131/182] extract modified files in verl/ --- .../fully_async_policy/agent_loop/__init__.py | 21 + .../agent_loop/agent_loop.py | 704 ++++++++ .../partial_single_turn_agent_loop.py | 74 + .../agent_loop/single_turn_agent_loop.py | 55 + .../agent_loop/vllm_async_server.py | 401 +++++ .../config/fully_async_ppo_trainer.yaml | 4 +- recipe/fully_async_policy/fsdp_workers.py | 7 +- recipe/fully_async_policy/fully_async_main.py | 10 +- .../fully_async_rollouter.py | 4 +- .../fully_async_policy/fully_async_trainer.py | 2 +- recipe/fully_async_policy/main_ppo.py | 344 ++++ recipe/fully_async_policy/ray_trainer.py | 1434 +++++++++++++++++ 12 files changed, 3045 insertions(+), 15 deletions(-) create mode 100644 recipe/fully_async_policy/agent_loop/__init__.py create mode 100644 recipe/fully_async_policy/agent_loop/agent_loop.py create mode 100644 recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py create mode 100644 recipe/fully_async_policy/agent_loop/single_turn_agent_loop.py create mode 100644 recipe/fully_async_policy/agent_loop/vllm_async_server.py create mode 100644 recipe/fully_async_policy/main_ppo.py create mode 100644 recipe/fully_async_policy/ray_trainer.py diff --git a/recipe/fully_async_policy/agent_loop/__init__.py b/recipe/fully_async_policy/agent_loop/__init__.py new file mode 100644 index 00000000000..7e583cb220d --- /dev/null +++ 
b/recipe/fully_async_policy/agent_loop/__init__.py @@ -0,0 +1,21 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .agent_loop import AgentLoopBase, AgentLoopManager +from .partial_single_turn_agent_loop import PartialSingleTurnAgentLoop +from .single_turn_agent_loop import SingleTurnAgentLoop + +_ = [SingleTurnAgentLoop, PartialSingleTurnAgentLoop] + +__all__ = ["AgentLoopBase", "AgentLoopManager"] \ No newline at end of file diff --git a/recipe/fully_async_policy/agent_loop/agent_loop.py b/recipe/fully_async_policy/agent_loop/agent_loop.py new file mode 100644 index 00000000000..4e6c9ff9285 --- /dev/null +++ b/recipe/fully_async_policy/agent_loop/agent_loop.py @@ -0,0 +1,704 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import asyncio +import heapq +import logging +import os +import random +from abc import ABC, abstractmethod +from typing import Any, Optional + +import hydra +import numpy as np +import ray +import torch +from cachetools import LRUCache +from omegaconf import DictConfig, OmegaConf +from pydantic import BaseModel +from tensordict import TensorDict +from transformers import AutoTokenizer + +from verl.protocol import DataProto +from verl.single_controller.ray.base import RayWorkerGroup +from verl.utils import hf_tokenizer +from verl.utils.fs import copy_to_local +from verl.utils.rollout_trace import RolloutTraceConfig, rollout_trace_attr, rollout_trace_op + +logger = logging.getLogger(__file__) +logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) + + +class AsyncLLMServerManager: + """ + A class to manage multiple OpenAI compatible LLM servers. This class provides + - Load balance: least requests load balancing + - Sticky session: send multi-turn chat completions to same server for automatic prefix caching + """ + + def __init__(self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], max_cache_size: int = 10000): + """Initialize the AsyncLLMServerManager. + + Args: + config (DictConfig): YAML config. + server_handles (List[ray.actor.ActorHandle]): OpenAI compatible LLM server actor handles. + max_cache_size (int, optional): max cache size for request_id to server mapping. Defaults to 10000. 
+ """ + self.config = config + self.server_handles = server_handles + random.shuffle(self.server_handles) + + # Least requests load balancing + self.weighted_serveres = [[0, (hash(server), server)] for server in server_handles] + heapq.heapify(self.weighted_serveres) + + # LRU cache to map request_id to server + self.request_id_to_server = LRUCache(maxsize=max_cache_size) + + def _choose_server(self, request_id: str) -> ray.actor.ActorHandle: + # TODO: implement server pressure awareness load balancing + if request_id in self.request_id_to_server: + return self.request_id_to_server[request_id] + + server = self.weighted_serveres[0][1][1] + self.weighted_serveres[0][0] += 1 + heapq.heapreplace(self.weighted_serveres, self.weighted_serveres[0]) + self.request_id_to_server[request_id] = server + return server + + @rollout_trace_op + async def generate( + self, + request_id, + *, + prompt_ids: list[int], + sampling_params: dict[str, Any], + ) -> list[int]: + """Generate tokens from prompt ids. + + Args: + request_id (str): request id for sticky session. + prompt_ids (List[int]): List of prompt token ids. + sampling_params (Dict[str, Any]): Sampling parameters for the chat completion. + + Returns: + List[int]: List of generated token ids. + """ + server = self._choose_server(request_id) + output = await server.generate.remote( + request_id=request_id, + prompt_ids=prompt_ids, + sampling_params=sampling_params, + ) + return output + + async def generate_for_partial(self, request_id, prompt_ids, sampling_params): + """Generate tokens from prompt ids. 
with partial rollout function""" + server = self._choose_server(request_id) + output = await server.generate_for_partial.remote( + request_id=request_id, + prompt_ids=prompt_ids, + sampling_params=sampling_params, + ) + return output + + +class AgentLoopMetrics(BaseModel): + """Agent loop performance metrics.""" + + generate_sequences: float = 0.0 + tool_calls: float = 0.0 + + +class AgentLoopOutput(BaseModel): + """Agent loop output.""" + + prompt_ids: list[int] + """Prompt token ids.""" + response_ids: list[int] + """Response token ids including LLM generated token, tool response token.""" + response_mask: list[int] + """Response mask, 1 for LLM generated token, 0 for tool response token.""" + num_turns: int = 0 + """Number of chat turns, including user, assistant, tool.""" + metrics: AgentLoopMetrics + """Auxiliary performance metrics""" + is_cancel: bool = False + """Indicates whether the request was interrupted""" + log_probs: list[float] = None + """Response token log probs including LLM generated token, tool response token.""" + + +# make hydra.utils.instantiate happy +class _DummyConfig: + def __init__(self, config: DictConfig) -> None: + self.config = config + + +class AgentLoopBase(ABC): + """An agent loop takes a input message, chat with OpenAI compatible LLM server and interact with various + environments.""" + + _class_initialized = False + + def __init__( + self, trainer_config: _DummyConfig, server_manager: AsyncLLMServerManager, tokenizer: AutoTokenizer, **kwargs + ): + """Initialize agent loop, each sample will have its own loop instance. + + Args: + trainer_config (_DummyConfig): trainer config. + server_manager (AsyncLLMServerManager): OpenAI compatible LLM server manager. + tokenizer (AutoTokenizer): Tokenizer for tokenize messages. 
+ """ + self.init_class(trainer_config.config, tokenizer, **kwargs) + self.config = trainer_config.config + self.server_manager = server_manager + self.tokenizer = tokenizer + self.loop = asyncio.get_running_loop() + + @classmethod + def init_class(cls, config: DictConfig, tokenizer: AutoTokenizer, **kwargs): + """This is used to do heavy initialization work that should shared across all instances. It's only called once. + + Args: + config (DictConfig): trainer config. + tokenizer (AutoTokenizer): Tokenizer for tokenize messages. + **kwargs: extra kwargs from config file passed in by `hydra.utils.instantiate`. + """ + if cls._class_initialized: + return + cls._class_initialized = True + + @abstractmethod + async def run(self, messages: list[dict[str, Any]], sampling_params: dict[str, Any]) -> AgentLoopOutput: + """Run agent loop to interact with LLM server and environment. + + Args: + messages (List[Dict[str, Any]]): Input messages. + sampling_params (Dict[str, Any]): LLM sampling params. + + Returns: + AgentLoopOutput: Agent loop output. + """ + raise NotImplementedError + + +"""Agent loop registry: key is agent_name, value is a dict of agent loop config +used by hydra.utils.instantiate to initialize agent loop instance. 
+ +https://hydra.cc/docs/advanced/instantiate_objects/overview/ +""" +_agent_loop_registry: dict[str, dict] = {} + + +def register(agent_name: str): + """Register agent loop class.""" + + def decorator(subclass: type[AgentLoopBase]) -> type[AgentLoopBase]: + fqdn = f"{subclass.__module__}.{subclass.__qualname__}" + _agent_loop_registry[agent_name] = {"_target_": fqdn} + return subclass + + return decorator + + +def postprocess_agent_loop_outputs(inputs: list[AgentLoopOutput], tokenizer, config) -> DataProto: + """Static method to postprocess a list of AgentLoopOutput into DataProto + + Args: + inputs: List of AgentLoopOutput + tokenizer: Tokenizer instance + config: Configuration object + + Returns: + DataProto: Processed batch data + """ + # NOTE: consistent with batch version of generate_sequences in vllm_rollout_spmd.py + # prompts: left pad + # responses: right pad + # input_ids: prompt + response + # attention_mask: [0,0,0,0,1,1,1,1, | 1,1,1,0,0,0,0,0] + # position_ids: [0,0,0,0,0,1,2,3, | 4,5,6,7,8,9,10,11] + + # prompts + tokenizer.padding_side = "left" + outputs = tokenizer.pad( + [{"input_ids": input.prompt_ids} for input in inputs], + padding="max_length", + max_length=config.actor_rollout_ref.rollout.prompt_length, + return_tensors="pt", + return_attention_mask=True, + ) + prompt_ids, prompt_attention_mask = outputs["input_ids"], outputs["attention_mask"] + + # responses + tokenizer.padding_side = "right" + outputs = tokenizer.pad( + [{"input_ids": input.response_ids} for input in inputs], + padding="max_length", + max_length=config.actor_rollout_ref.rollout.response_length, + return_tensors="pt", + return_attention_mask=True, + ) + response_ids, response_attention_mask = outputs["input_ids"], outputs["attention_mask"] + + # response_mask + outputs = tokenizer.pad( + [{"input_ids": input.response_mask} for input in inputs], + padding="max_length", + max_length=config.actor_rollout_ref.rollout.response_length, + return_tensors="pt", + 
def __init__(self, config: DictConfig, server_handles: list[ray.actor.ActorHandle]):
    """Initialize the agent loop worker.

    Builds the LLM server manager and the tokenizer, loads any extra agent loop
    configs into the module-level registry, and initializes rollout tracing.

    Args:
        config (DictConfig): YAML config.
        server_handles (List[ray.actor.ActorHandle]): OpenAI compatible LLM server actor handles.
    """
    self.config = config
    self.server_manager = AsyncLLMServerManager(config, server_handles)

    model_path = config.actor_rollout_ref.model.path
    # Keep only the last two path components as the model name.
    self.model_name = "/".join(model_path.split("/")[-2:])
    local_path = copy_to_local(model_path)
    self.tokenizer = hf_tokenizer(local_path, trust_remote_code=True)

    agent_loop_config_path = config.actor_rollout_ref.rollout.agent.agent_loop_config_path
    if agent_loop_config_path:
        # Each loaded entry is stored under its own `name`, replacing any
        # registration that already uses that name.
        for agent_loop_config in OmegaConf.load(agent_loop_config_path):
            _agent_loop_registry[agent_loop_config.name] = agent_loop_config

    # The trace settings come from the rollout section. The previous extra read of
    # `trainer.rollout_trace` was overwritten before use and has been removed.
    trace_config = self.config.actor_rollout_ref.rollout.get("trace", {})
    RolloutTraceConfig.init(
        self.config.trainer.project_name,
        self.config.trainer.experiment_name,
        trace_config.get("backend"),
        trace_config.get("token2text", False),
    )
async def generate_sequences_no_post(
    self, batch: DataProto, partial_output_list: Optional[list[AgentLoopOutput]]
) -> list[AgentLoopOutput]:
    """Run one agent loop per sample and return the raw, un-padded outputs.

    Args:
        batch (DataProto): Input batch. ``non_tensor_batch["raw_prompt"]`` is required;
            ``agent_name`` and ``index`` are filled in when missing.
        partial_output_list: Earlier (possibly cancelled) outputs, one per sample,
            or ``None``/empty when every sample starts from scratch.

    Returns:
        list[AgentLoopOutput]: One output per sample, in batch order.
    """
    rollout_config = self.config.actor_rollout_ref.rollout
    sampling_params = {
        "temperature": rollout_config.temperature,
        "top_p": rollout_config.top_p,
        "repetition_penalty": 1.0,
    }

    # Validation runs use their own top_p / temperature.
    if batch.meta_info.get("validate", False):
        sampling_params["top_p"] = rollout_config.val_kwargs.top_p
        sampling_params["temperature"] = rollout_config.val_kwargs.temperature

    non_tensors = batch.non_tensor_batch
    # Samples without an explicit agent are handled by the single turn agent.
    if "agent_name" not in non_tensors:
        non_tensors["agent_name"] = np.array(["single_turn_agent"] * len(batch), dtype=object)

    agent_names = non_tensors["agent_name"]
    raw_prompts = non_tensors["raw_prompt"]
    index = non_tensors["index"] if "index" in non_tensors else np.arange(len(raw_prompts))

    trajectory_info = await get_trajectory_info(
        batch.meta_info.get("global_steps", -1), index, batch.meta_info.get("validate", False)
    )

    # No earlier results: every sample gets a None placeholder.
    if not partial_output_list:
        partial_output_list = [None] * len(batch)

    # strict=True makes a length mismatch between the per-sample sequences an error.
    tasks = [
        asyncio.create_task(
            self._run_agent_loop(agent_name, messages.tolist(), sampling_params, trajectory, partial_output)
        )
        for agent_name, messages, trajectory, partial_output in zip(
            agent_names, raw_prompts, trajectory_info, partial_output_list, strict=True
        )
    ]
    return await asyncio.gather(*tasks)
async def get_trajectory_info(step, index, validate):
    """Build the per-sample trajectory metadata.

    ``rollout_n`` counts how many times in a row the same dataset index has just
    been seen: the first sample of a run gets 0, the next identical one 1, and so
    on. Only adjacent samples are compared, so repeats of an index that are not
    consecutive start again at 0.

    Args:
        step (int): global steps in the trainer.
        index (list): from datastore extra_info.index column.
        validate (bool): whether this is a validation step.

    Returns:
        list: one dict per sample with keys ``step``, ``sample_index``,
        ``rollout_n`` and ``validate``.
    """
    trajectory_info = []
    rollout_n = 0
    for position, sample_index in enumerate(index):
        # Same index as the previous sample: another rollout of the same prompt.
        if position > 0 and index[position - 1] == sample_index:
            rollout_n += 1
        else:
            rollout_n = 0
        trajectory_info.append(
            {"step": step, "sample_index": sample_index, "rollout_n": rollout_n, "validate": validate}
        )
    return trajectory_info
def _initialize_llm_servers(self):
    """Start one async LLM server actor per rollout data-parallel rank.

    Stores the actor handles in ``self.async_llm_servers`` and their addresses in
    ``self.server_addresses`` (both indexed by rollout DP rank), then initializes
    the engine on every server.
    """
    self.rollout_tp_size = self.config.actor_rollout_ref.rollout.tensor_model_parallel_size
    # Each server spans `rollout_tp_size` workers of the worker group.
    self.rollout_dp_size = self.worker_group.world_size // self.rollout_tp_size

    # The register center is a named actor; its worker info list has one entry
    # per worker and is used below as the node id for scheduling.
    register_center = ray.get_actor(f"{self.worker_group.name_prefix}_register_center")
    workers_info = ray.get(register_center.get_worker_info.remote())
    assert len(workers_info) == self.worker_group.world_size

    self.async_llm_servers = [None] * self.rollout_dp_size
    self.server_addresses = [None] * self.rollout_dp_size

    # A custom server class (module path + class name) takes precedence over the
    # built-in one selected by the rollout backend name.
    if self.config.actor_rollout_ref.rollout.agent.custom_async_server:
        server_class = async_server_class(
            rollout_backend=self.config.actor_rollout_ref.rollout.name,
            rollout_backend_module=self.config.actor_rollout_ref.rollout.agent.custom_async_server.path,
            rollout_backend_class=self.config.actor_rollout_ref.rollout.agent.custom_async_server.name,
        )
    else:
        server_class = async_server_class(rollout_backend=self.config.actor_rollout_ref.rollout.name)

    # Start all server instances, restart if address already in use.
    unready_dp_ranks = set(range(self.rollout_dp_size))
    while len(unready_dp_ranks) > 0:
        # Launch a server for every rank that is not ready yet.
        servers = {
            rollout_dp_rank: server_class.options(
                # make sure AsyncvLLMServer colocates with its corresponding workers
                scheduling_strategy=ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy(
                    node_id=workers_info[rollout_dp_rank * self.rollout_tp_size],
                    soft=False,
                ),
                name=f"async_llm_server_{rollout_dp_rank}",
            ).remote(self.config, self.rollout_dp_size, rollout_dp_rank, self.worker_group.name_prefix)
            for rollout_dp_rank in unready_dp_ranks
        }

        # `servers` is a separate dict, so removing ranks from `unready_dp_ranks`
        # while iterating over it is safe.
        for rollout_dp_rank, server in servers.items():
            try:
                address = ray.get(server.get_server_address.remote())
                self.server_addresses[rollout_dp_rank] = address
                self.async_llm_servers[rollout_dp_rank] = server
                unready_dp_ranks.remove(rollout_dp_rank)
            except Exception:
                # Any failure kills the actor; the rank stays in the unready set
                # and is retried on the next pass. NOTE(review): there is no
                # retry limit, so a persistent failure loops forever.
                ray.kill(server)
                print(f"rollout server {rollout_dp_rank} failed, maybe address already in use, restarting...")

    # All server instances are ready, init AsyncLLM engine.
    ray.get([server.init_engine.remote() for server in self.async_llm_servers])
async def generate_single_sample_async(
    self,
    sample: DataProto,
    partial_output_list: Optional[list[AgentLoopOutput]],
) -> list[AgentLoopOutput]:
    """Process a single sample asynchronously on one agent loop worker.

    Args:
        sample: Data for one sample (the caller is expected to have repeated it n times).
        partial_output_list: Optional[List[AgentLoopOutput]]: already rollout result.

    Returns:
        list[AgentLoopOutput]: The raw outputs returned by the worker's
        ``generate_sequences_no_post`` call, without post-processing.
    """
    # Pick a worker, then submit the sample to its un-post-processed entry point.
    chosen_worker = self._select_best_worker()
    object_ref = chosen_worker.generate_sequences_no_post.remote(sample, partial_output_list)
    # Bridge the remote call's future into the running event loop.
    return await asyncio.wrap_future(object_ref.future())


def _select_best_worker(self):
    """Select the next worker using simple round-robin load balancing."""
    try:
        cursor = self._worker_index
    except AttributeError:
        # First call: start from the first worker.
        cursor = self._worker_index = 0

    chosen_worker = self.agent_loop_workers[cursor]
    self._worker_index = (cursor + 1) % len(self.agent_loop_workers)
    return chosen_worker
def async_server_class(
    rollout_backend: str, rollout_backend_module: Optional[str] = None, rollout_backend_class: Optional[str] = None
) -> type[AsyncServerBase]:
    """Resolve the async server class for a rollout backend.

    Args:
        rollout_backend: str, rollout backend type (alias); only "vllm" is built in.
        rollout_backend_module: Optional[str], import path of a custom rollout backend.
        rollout_backend_class: Optional[str], class name of a custom rollout backend.

    Returns:
        Type[AsyncServerBase]: async server class.

    Raises:
        NotImplementedError: no customization was given and the backend is not "vllm".
        ValueError: only one of the two customization arguments was given.
    """
    has_module = rollout_backend_module is not None
    has_class = rollout_backend_class is not None

    if not (has_module or has_class):
        # Default path. Keep the plain `from ... import ...` form here:
        # importlib.import_module behaves subtly differently under ray.
        if rollout_backend != "vllm":
            raise NotImplementedError(f"rollout backend {rollout_backend} is not supported")
        from recipe.fully_async_policy.agent_loop.vllm_async_server import AsyncvLLMServer

        return AsyncvLLMServer

    if not (has_module and has_class):
        raise ValueError("rollout_backend_module and rollout_backend_class must be both provided for customization")

    from verl.utils.import_utils import load_extern_type

    return load_extern_type(rollout_backend_module, rollout_backend_class)
@register("partial_single_turn_agent")
class PartialSingleTurnAgentLoop(AgentLoopBase):
    """Single turn chat completion loop whose generation can be cancelled and resumed.

    A cancelled run returns an output with ``is_cancel`` set. Passing that output
    back into :meth:`run` continues generation from the tokens produced so far.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.prompt_length = self.config.actor_rollout_ref.rollout.prompt_length
        self.response_length = self.config.actor_rollout_ref.rollout.response_length

    async def run(
        self,
        messages: list[dict[str, Any]],
        sampling_params: dict[str, Any],
        output: Optional[AgentLoopOutput] = None,
    ) -> AgentLoopOutput:
        """Generate a response, or continue a previously cancelled one.

        Args:
            messages: Chat messages; only tokenized when there is no earlier output.
            sampling_params: LLM sampling params forwarded to the server manager.
            output: Result of an earlier run for the same sample, or ``None``.

        Returns:
            AgentLoopOutput: ``output`` unchanged when it was not cancelled; otherwise
            a new output whose response and log-probs are the earlier ones followed
            by the newly generated ones, all clipped to ``response_length``.
        """
        # Within one batch some samples are cancelled and some are not; a sample
        # that already finished is returned unchanged.
        if output is not None and not output.is_cancel:
            return output

        if output is None:
            prompt_ids = await self.loop.run_in_executor(
                None, lambda: self.tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True)
            )
            generation_prompt_ids = prompt_ids
        else:
            # Resume a cancelled sample: the tokens generated so far are appended
            # to the prompt sent to the server, but the reported prompt stays the
            # original one.
            prompt_ids = output.prompt_ids
            generation_prompt_ids = output.prompt_ids + output.response_ids

        metrics = {}
        request_id = uuid4().hex
        with simple_timer("generate_sequences", metrics):
            response_ids, log_probs, is_cancel = await self.server_manager.generate_for_partial(
                request_id=request_id, prompt_ids=generation_prompt_ids, sampling_params=sampling_params
            )

        if output is not None:
            # Put the earlier partial result in front of the new tokens.
            log_probs = output.log_probs + log_probs
            response_ids = output.response_ids + response_ids

        # Clip tokens and log-probs together so they stay aligned one-to-one
        # (previously only the tokens and the mask were clipped).
        response_ids = response_ids[: self.response_length]
        log_probs = log_probs[: self.response_length]
        # Every response token of a single-turn loop is LLM generated.
        response_mask = [1] * len(response_ids)

        return AgentLoopOutput(
            prompt_ids=prompt_ids,
            response_ids=response_ids,
            response_mask=response_mask,
            num_turns=2,
            metrics=metrics,
            is_cancel=is_cancel,
            log_probs=log_probs,
        )
+import logging +import os +from typing import Any +from uuid import uuid4 + +from recipe.fully_async_policy.agent_loop.agent_loop import AgentLoopBase, AgentLoopOutput, register +from verl.utils.profiler import simple_timer + +logger = logging.getLogger(__file__) +logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) + + +@register("single_turn_agent") +class SingleTurnAgentLoop(AgentLoopBase): + """Naive agent loop that only do single turn chat completion.""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.prompt_length = self.config.actor_rollout_ref.rollout.prompt_length + self.response_length = self.config.actor_rollout_ref.rollout.response_length + + async def run(self, messages: list[dict[str, Any]], sampling_params: dict[str, Any]) -> AgentLoopOutput: + metrics = {} + request_id = uuid4().hex + prompt_ids = await self.loop.run_in_executor( + None, lambda: self.tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True) + ) + + with simple_timer("generate_sequences", metrics): + response_ids = await self.server_manager.generate( + request_id=request_id, prompt_ids=prompt_ids, sampling_params=sampling_params + ) + response_mask = [1] * len(response_ids) + + output = AgentLoopOutput( + prompt_ids=prompt_ids, + response_ids=response_ids[: self.response_length], + response_mask=response_mask[: self.response_length], + num_turns=2, + metrics=metrics, + ) + return output diff --git a/recipe/fully_async_policy/agent_loop/vllm_async_server.py b/recipe/fully_async_policy/agent_loop/vllm_async_server.py new file mode 100644 index 00000000000..03fc28c8549 --- /dev/null +++ b/recipe/fully_async_policy/agent_loop/vllm_async_server.py @@ -0,0 +1,401 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
def _get_model_runner_workers(vllm_config, init_ray: bool = True):
    """Look up the external ray actors that serve one vLLM data-parallel rank.

    ``vllm_config.instance_id`` must have four ``:``-separated fields:
    namespace, worker group prefix, vLLM DP size and vLLM DP rank.

    Args:
        vllm_config: vLLM config; ``instance_id`` and
            ``parallel_config.tensor_parallel_size`` are read.
        init_ray: When True, call ``ray.init`` with the namespace from the instance
            id before listing actors.

    Returns:
        The actor handles for this DP rank: a slice of ``tensor_parallel_size``
        actors taken from the list sorted by (pg_index, local_rank).

    Raises:
        AssertionError: the instance id is missing or malformed, an actor name is
            malformed, or the number of matching actors is not dp_size * tp_size.
    """
    assert vllm_config.instance_id is not None, "instance_id must be set for external ray actors."

    fields = vllm_config.instance_id.split(":")
    # The message now names the four expected fields; it used to read ":::.".
    assert len(fields) == 4, (
        f"instance_id: {vllm_config.instance_id} must be in the format of "
        "namespace:wg_prefix:vllm_dp_size:vllm_dp_rank."
    )
    namespace, wg_prefix = fields[0], fields[1]
    vllm_dp_size, vllm_dp_rank = int(fields[2]), int(fields[3])

    # Make sure subprocess in same namespace as parent actor.
    # actor name format: {name_prefix}WorkerDict_{pg_idx}:{local_rank}
    if init_ray:
        ray.init(namespace=namespace)
    worker_prefix = f"{wg_prefix}WorkerDict"
    actor_names = [name for name in ray.util.list_named_actors() if name.startswith(worker_prefix)]

    vllm_tp_size = vllm_config.parallel_config.tensor_parallel_size
    assert len(actor_names) == vllm_dp_size * vllm_tp_size, (
        f"instance_id: {vllm_config.instance_id} has {len(actor_names)} actors, but vllm_dp_size: "
        f"{vllm_dp_size} * vllm_tp_size: {vllm_tp_size} = {vllm_dp_size * vllm_tp_size} is expected."
    )

    def get_pg_index_and_local_rank(actor_name) -> tuple[int, int]:
        name_parts = actor_name.split(":")
        assert len(name_parts) == 2, f"invalid actor name: {actor_name}"
        pg_index, local_rank = int(name_parts[0].split("_")[-1]), int(name_parts[1])
        return pg_index, local_rank

    # sort actor names by pg_index and local_rank, then take this DP rank's slice
    actor_names = sorted(actor_names, key=get_pg_index_and_local_rank)
    actor_names = actor_names[vllm_dp_rank * vllm_tp_size : (vllm_dp_rank + 1) * vllm_tp_size]
    workers: list[WorkerWrapperBase] = [ray.get_actor(actor_name) for actor_name in actor_names]
    print(f"instance_id: {vllm_config.instance_id} initializes with external actors: {actor_names}")

    return workers
class ExternalZeroMQDistributedExecutor(Executor):
    """An executor whose engines are launched by external ray actors and reached over ZeroMQ."""

    uses_ray: bool = False

    def _init_executor(self) -> None:
        """Connect one REQ socket per worker address, then initialize the workers.

        Addresses are read from the comma separated ``VERL_VLLM_ZMQ_ADDRESSES``
        environment variable.
        """
        worker_addresses = os.environ["VERL_VLLM_ZMQ_ADDRESSES"].split(",")
        self.context = zmq.Context()
        self.sockets = []
        for worker_address in worker_addresses:
            req_socket = self.context.socket(zmq.REQ)
            req_socket.connect(worker_address)
            self.sockets.append(req_socket)

        init_kwargs = {
            "vllm_config": self.vllm_config,
            "local_rank": None,
            "rank": None,
            "distributed_init_method": "env://",
            "is_driver_worker": True,
        }
        self.collective_rpc("init_worker", args=([init_kwargs],))
        self.collective_rpc("init_device")
        self.collective_rpc("load_model")

    def collective_rpc(
        self,
        method: str | Callable,
        timeout: Optional[float] = None,
        args: tuple = (),
        kwargs: Optional[dict[str, Any]] = None,
    ) -> list[Any]:
        """Send one call to every worker socket and collect the replies in socket order.

        Args:
            method: Method name, or a callable that is pickled before sending.
            timeout: Accepted for interface compatibility; not used here.
            args: Positional arguments for the remote call.
            kwargs: Keyword arguments for the remote call (``None`` means none).

        Returns:
            list[Any]: The unpickled reply of each worker.
        """
        sent_method = method if isinstance(method, str) else pickle.dumps(method)
        payload = pickle.dumps((sent_method, args, kwargs or {}))

        # Send to every worker first, then read every reply.
        for req_socket in self.sockets:
            req_socket.send(payload, zmq.DONTWAIT)

        return [pickle.loads(req_socket.recv()) for req_socket in self.sockets]

    def check_health(self):
        """No-op health check."""
        return
async def init_engine(self):
    """Init vLLM AsyncLLM engine and the OpenAI-compatible chat serving layer.

    Sets ``self.max_model_len``, ``self.engine`` and ``self.openai_serving_chat``.
    """
    model_cfg = self.config.model
    model_path = model_cfg.path
    model_name = "/".join(model_path.split("/")[-2:])
    local_path = copy_to_local(model_path)
    trust_remote_code = model_cfg.get("trust_remote_code", False)
    rollout_cfg = self.config.rollout

    tensor_parallel_size = rollout_cfg.get("tensor_model_parallel_size", 1)
    max_num_batched_tokens = rollout_cfg.get("max_num_batched_tokens", 8192)
    # Fall back to prompt + response length when max_model_len is unset or falsy.
    max_model_len = rollout_cfg.max_model_len or (rollout_cfg.prompt_length + rollout_cfg.response_length)
    self.max_model_len = int(max_model_len)

    # Override default generation config from hugging face model config,
    # user can still override them by passing kwargs in each request.
    generation_overrides = {
        "n": 1,
        "logprobs": 0,
        "repetition_penalty": 1.0,
        "max_new_tokens": rollout_cfg.response_length,
    }
    # Copy every rollout config key that is also a SamplingParams attribute.
    sampling_probe = SamplingParams()
    for key in rollout_cfg.keys():
        if hasattr(sampling_probe, str(key)):
            generation_overrides[key] = rollout_cfg.get(key)
    print(f"override_generation_config: {generation_overrides}")

    # Unknown backend names map to None.
    executor_by_backend = {
        "zeromq": ExternalZeroMQDistributedExecutor,
        "ray": ExternalRayDistributedExecutor,
    }
    backend = os.environ.get("VERL_VLLM_DISTRIBUTED_BACKEND", "zeromq")
    distributed_executor_backend = executor_by_backend.get(backend)

    engine_args = AsyncEngineArgs(
        model=local_path,
        enable_sleep_mode=rollout_cfg.free_cache_engine,
        override_generation_config=generation_overrides,
        tensor_parallel_size=tensor_parallel_size,
        distributed_executor_backend=distributed_executor_backend,
        dtype=rollout_cfg.dtype,
        enforce_eager=rollout_cfg.enforce_eager,
        gpu_memory_utilization=rollout_cfg.gpu_memory_utilization,
        disable_custom_all_reduce=True,
        skip_tokenizer_init=False,
        max_model_len=self.max_model_len,
        load_format="auto",
        disable_log_stats=rollout_cfg.disable_log_stats,
        max_num_batched_tokens=max_num_batched_tokens,
        enable_chunked_prefill=rollout_cfg.enable_chunked_prefill,
        enable_prefix_caching=True,
        trust_remote_code=trust_remote_code,
        seed=rollout_cfg.get("seed", 0),
    )

    # init async llm engine
    self.engine = AsyncLLM.from_vllm_config(self._create_engine_config(engine_args))

    # build serving chat
    model_config = self.engine.model_config
    base_model_paths = [BaseModelPath(name=model_name, model_path=model_path)]
    serving_models = OpenAIServingModels(self.engine, model_config, base_model_paths)
    self.openai_serving_chat = OpenAIServingChat(
        self.engine,
        model_config,
        serving_models,
        "assistant",
        request_logger=RequestLogger(max_log_len=4096),
        chat_template=None,
        chat_template_content_format="auto",
        enable_auto_tools=rollout_cfg.multi_turn.tool_config_path is not None,
        tool_parser=rollout_cfg.multi_turn.format,  # hermes, llama3_json, ...
    )
final_res.outputs[0].token_ids + + async def _generate_step(self, prompt_ids: list[int], sampling_params: dict[str, Any], request_id: str): + max_tokens = self.max_model_len - len(prompt_ids) + sampling_params = SamplingParams(max_tokens=max_tokens, logprobs=1, **sampling_params) + prompt = TokensPrompt(prompt_token_ids=prompt_ids) + generator = self.engine.generate(prompt=prompt, sampling_params=sampling_params, request_id=request_id) + + # Get final response + self.req_output[request_id]: Optional[RequestOutput] = None + async for output in generator: + self.req_output[request_id] = output + assert self.req_output[request_id] is not None + + async def generate_for_partial( + self, prompt_ids: list[int], sampling_params: dict[str, Any], request_id: str + ) -> tuple[list[Any], list[Any], bool] | tuple[Sequence[int], list[float], Any]: + """Generate until the request finishes or cancel() fires; return (token_ids, log_probs, is_cancel).""" + # Register this request's cancel flag + async with self.lock: + if self.paused: + # After cancel(), requests return immediately and wait to be resubmitted + return [], [], True + self.cancel_event[request_id] = asyncio.Event() + cancel_handle = asyncio.create_task(self.cancel_event[request_id].wait()) + generation_handle = asyncio.create_task(self._generate_step(prompt_ids, sampling_params, request_id)) + + done, pend = await asyncio.wait([generation_handle, cancel_handle], return_when=asyncio.FIRST_COMPLETED) + + for task in done: + await task + + for task in pend: + task.cancel() + + async with self.lock: + is_cancel = generation_handle not in done + self.cancel_event.pop(request_id, None) + final_output = self.req_output.pop(request_id, None) + if final_output is None: + # Cancelled before the engine yielded any output: nothing partial to return, report it as cancelled + return [], [], True + token_ids = final_output.outputs[0].token_ids + log_probs: list[float] = [] + for i, x in enumerate(final_output.outputs[0].logprobs): + # logprobs=1 is requested but several entries may come back per position; take the sampled token's log-prob + token_id = token_ids[i] + log_probs.append(x[token_id].logprob) + return token_ids, log_probs, is_cancel + + async def cancel(self): + async with self.lock: + self.paused = True + for request_id in
self.cancel_event: + self.cancel_event[request_id].set() + + async def resume(self): + async with self.lock: + self.paused = False + + async def wake_up(self): + if self.config.rollout.free_cache_engine: + await self.engine.wake_up() + + async def sleep(self): + # TODO: https://github.com/vllm-project/vllm/issues/17103 + await self.engine.reset_prefix_cache() + if self.config.rollout.free_cache_engine: + await self.engine.sleep() + + diff --git a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml index 0d6c05a9a5c..c2708b975be 100644 --- a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml +++ b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml @@ -33,10 +33,10 @@ rollout: # number of responses (i.e. num sample times). > 1 for grpo n: 4 - # Number of epochs in training + # total rollout samples # TODO rename to total_rollout_samples total_rollout_steps: 100 - # + # Number of epochs in training total_epochs: 10 # Test frequency, how many times a parameter update triggers a validation diff --git a/recipe/fully_async_policy/fsdp_workers.py b/recipe/fully_async_policy/fsdp_workers.py index dd941c26684..41fa3a55eec 100644 --- a/recipe/fully_async_policy/fsdp_workers.py +++ b/recipe/fully_async_policy/fsdp_workers.py @@ -51,19 +51,16 @@ def get_inference_model(rollout): """ - 根据不同类型的inference_engine获取模型对象 + get models according to different types of inference_engine Args: - rollout: rollout对象,包含inference_engine + rollout: rollout object Returns: model: 模型对象 """ inference_engine = rollout.inference_engine - # 判断inference_engine的类型 if hasattr(inference_engine, "llm_engine"): - # LLM类型 - vLLMRollout inference_model = inference_engine.llm_engine.model_executor.driver_worker.worker.model_runner.model elif hasattr(inference_engine, "worker"): - # WorkerWrapperBase类型 - vLLMAsyncRollout inference_model = inference_engine.worker.model_runner.model else: raise AttributeError( diff 
--git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py index a588679991c..699222f350a 100644 --- a/recipe/fully_async_policy/fully_async_main.py +++ b/recipe/fully_async_policy/fully_async_main.py @@ -24,7 +24,7 @@ from recipe.fully_async_policy.fully_async_rollouter import FullyAsyncRollouter from recipe.fully_async_policy.fully_async_trainer import FullyAsyncTrainer from recipe.fully_async_policy.message_queue import MessageQueue, MessageQueueClient -from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role +from recipe.fully_async_policy.ray_trainer import ResourcePoolManager, Role from verl.trainer.ppo.reward import load_reward_manager from verl.utils.fs import copy_to_local @@ -185,16 +185,16 @@ def _initialize_components(self, config) -> None: print("[ASYNC MAIN] Creating FullyAsyncTrainer...") self._create_trainer(config) - # 同步require samples + # sync require samples between rollouter and trainer required_samples = ray.get(self.components["trainer"].get_required_samples.remote()) ray.get(self.components["rollouter"].set_required_samples.remote(required_samples)) - # 同步total_train_steps + # sync total_train_steps between rollouter and trainer total_train_steps = ray.get(self.components["rollouter"].get_total_train_steps.remote()) print(f"total_train_steps {total_train_steps}") ray.get(self.components["trainer"].set_total_train_steps.remote(total_train_steps)) - # 获取 max_queue_size (使用同步方法避免异步返回值问题) + # max_queue_size max_queue_size = ray.get(self.components["rollouter"].get_max_queue_size.remote()) print(f"[ASYNC MAIN] Creating MessageQueue... 
max_queue_size {max_queue_size}") message_queue = MessageQueue.remote(config, max_queue_size) @@ -280,7 +280,7 @@ def _run_training_loop(self): @hydra.main(config_path="config", config_name="fully_async_ppo_trainer", version_base=None) def main(config): - from verl.trainer.main_ppo import run_ppo + from recipe.fully_async_policy.main_ppo import run_ppo # Ensure async training config exists if not hasattr(config, "async_training"): diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 162836a00f6..919314ba1b5 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -25,7 +25,7 @@ ) from recipe.fully_async_policy.message_queue import MessageQueueClient from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup -from verl.trainer.ppo.ray_trainer import RayPPOTrainer, ResourcePoolManager, Role, WorkerType +from recipe.fully_async_policy.ray_trainer import RayPPOTrainer, ResourcePoolManager, Role, WorkerType from verl.utils.profiler import marked_timer from verl.utils.tracking import ValidationGenerationsLogger @@ -257,7 +257,7 @@ def _create_continuous_iterator(self): def _init_async_rollout_manager(self): # create async rollout manager and request scheduler assert self.config.actor_rollout_ref.rollout.mode == "async" - from verl.experimental.agent_loop import AgentLoopManager + from recipe.fully_async_policy.agent_loop import AgentLoopManager self.async_rollout_mode = True self.async_rollout_manager = AgentLoopManager( diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 5d2a2c794e8..0c1501cbf89 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -31,7 +31,7 @@ from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup from verl.trainer.ppo import core_algos from 
verl.trainer.ppo.core_algos import AdvantageEstimator -from verl.trainer.ppo.ray_trainer import ( +from recipe.fully_async_policy.ray_trainer import ( RayPPOTrainer, ResourcePoolManager, Role, diff --git a/recipe/fully_async_policy/main_ppo.py b/recipe/fully_async_policy/main_ppo.py new file mode 100644 index 00000000000..4b240c6ffbf --- /dev/null +++ b/recipe/fully_async_policy/main_ppo.py @@ -0,0 +1,344 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Note that we don't combine the main with ray_trainer as ray_trainer is used by other main. +""" + +import os +import socket + +import hydra +import ray +from omegaconf import OmegaConf + +from verl.experimental.dataset.sampler import AbstractSampler +from verl.trainer.constants_ppo import get_ppo_ray_runtime_env +from verl.trainer.ppo.ray_trainer import RayPPOTrainer +from verl.trainer.ppo.reward import load_reward_manager +from verl.utils.device import is_cuda_available +from verl.utils.import_utils import load_extern_type + + +@hydra.main(config_path="config", config_name="ppo_trainer", version_base=None) +def main(config): + """Main entry point for PPO training with Hydra configuration management. + + Args: + config_dict: Hydra configuration dictionary containing training parameters. 
+ """ + from time import time + + start_time = time() + run_ppo(config) + print(f"total time: {time() - start_time:.2f} seconds") + + +# Define a function to run the PPO-like training process +def run_ppo(config, task_runner_class=None) -> None: + """Initialize Ray cluster and run distributed PPO training process. + + Args: + config: Training configuration object containing all necessary parameters + for distributed PPO training including Ray initialization settings, + model paths, and training hyperparameters. + """ + # Check if Ray is not initialized + if not ray.is_initialized(): + # Initialize Ray with a local cluster configuration + # Set environment variables in the runtime environment to control tokenizer parallelism, + # NCCL debug level, VLLM logging level, and allow runtime LoRA updating + # `num_cpus` specifies the number of CPU cores Ray can use, obtained from the configuration + ray.init( + runtime_env=get_ppo_ray_runtime_env(), + num_cpus=config.ray_init.num_cpus, + ) + # for recipe to change TaskRunner + if task_runner_class is None: + task_runner_class = TaskRunner + + # Create a remote instance of the TaskRunner class, and + # Execute the `run` method of the TaskRunner instance remotely and wait for it to complete + if ( + is_cuda_available + and config.trainer.get("profile_steps") is not None + and len(config.trainer.get("profile_steps", [])) > 0 + ): + nsight_options = OmegaConf.to_container(config.trainer.controller_nsight_options) + runner = task_runner_class.options(runtime_env={"nsight": nsight_options}).remote() + else: + runner = task_runner_class.remote() + ray.get(runner.run.remote(config)) + + # [Optional] get the path of the timeline trace file from the configuration, default to None + # This file is used for performance analysis + timeline_json_file = config.ray_init.get("timeline_json_file", None) + if timeline_json_file: + ray.timeline(filename=timeline_json_file) + + +@ray.remote(num_cpus=1) # please make sure main_task is not 
scheduled on head +class TaskRunner: + """Ray remote class for executing distributed PPO training tasks. + + This class encapsulates the main training logic and runs as a Ray remote actor + to enable distributed execution across multiple nodes and GPUs. + """ + + def run(self, config): + """Execute the main PPO training workflow. + + This method sets up the distributed training environment, initializes + workers, datasets, and reward functions, then starts the training process. + + Args: + config: Training configuration object containing all parameters needed + for setting up and running the PPO training process. + """ + # Print the initial configuration. `resolve=True` will evaluate symbolic values. + from pprint import pprint + + from omegaconf import OmegaConf + + from verl.utils.fs import copy_to_local + + print(f"TaskRunner hostname: {socket.gethostname()}, PID: {os.getpid()}") + pprint(OmegaConf.to_container(config, resolve=True)) + OmegaConf.resolve(config) + + # Download the checkpoint from HDFS to the local machine. + # `use_shm` determines whether to use shared memory, which could lead to faster model loading if turned on + local_path = copy_to_local( + config.actor_rollout_ref.model.path, use_shm=config.actor_rollout_ref.model.get("use_shm", False) + ) + + # Instantiate the tokenizer and processor. + from verl.utils import hf_processor, hf_tokenizer + + trust_remote_code = config.data.get("trust_remote_code", False) + tokenizer = hf_tokenizer(local_path, trust_remote_code=trust_remote_code) + # Used for multimodal LLM, could be None + processor = hf_processor(local_path, trust_remote_code=trust_remote_code, use_fast=True) + + # Define worker classes based on the actor strategy. 
+ if config.actor_rollout_ref.actor.strategy in {"fsdp", "fsdp2"}: + assert config.critic.strategy in {"fsdp", "fsdp2"} + from verl.single_controller.ray import RayWorkerGroup + from verl.workers.fsdp_workers import ActorRolloutRefWorker, AsyncActorRolloutRefWorker + + use_legacy_worker_impl = config.trainer.get("use_legacy_worker_impl", "auto") + if use_legacy_worker_impl in ["auto", "enable"]: + # import warnings + # warnings.warn(f"Legacy worker impl is going to be deprecated, will be removed in the future. \ + # Please set trainer.use_legacy_worker_impl = false to switch to the new worker implementation.") + from verl.workers.fsdp_workers import CriticWorker + elif use_legacy_worker_impl == "disable": + from verl.workers.roles import CriticWorker + + print("Using new worker implementation") + else: + raise ValueError(f"Invalid use_legacy_worker_impl: {use_legacy_worker_impl}") + + actor_rollout_cls = ( + AsyncActorRolloutRefWorker + if config.actor_rollout_ref.rollout.mode == "async" + else ActorRolloutRefWorker + ) + ray_worker_group_cls = RayWorkerGroup + + elif config.actor_rollout_ref.actor.strategy == "megatron": + assert config.actor_rollout_ref.actor.strategy == config.critic.strategy + from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup + from verl.workers.megatron_workers import ActorRolloutRefWorker, AsyncActorRolloutRefWorker, CriticWorker + + actor_rollout_cls = ( + AsyncActorRolloutRefWorker + if config.actor_rollout_ref.rollout.mode == "async" + else ActorRolloutRefWorker + ) + ray_worker_group_cls = NVMegatronRayWorkerGroup + + else: + raise NotImplementedError + + from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role + + # Map roles to their corresponding remote worker classes. + role_worker_mapping = { + Role.ActorRollout: ray.remote(actor_rollout_cls), + Role.Critic: ray.remote(CriticWorker), + } + + # Define the resource pool specification. + # Map roles to the resource pool. 
+ global_pool_id = "global_pool" + resource_pool_spec = { + global_pool_id: [config.trainer.n_gpus_per_node] * config.trainer.nnodes, + } + mapping = { + Role.ActorRollout: global_pool_id, + Role.Critic: global_pool_id, + } + + # We should adopt a multi-source reward function here: + # - for rule-based rm, we directly call a reward score + # - for model-based rm, we call a model + # - for code related prompt, we send to a sandbox if there are test cases + # finally, we combine all the rewards together + # The reward type depends on the tag of the data + if config.reward_model.enable: + if config.reward_model.strategy in {"fsdp", "fsdp2"}: + from verl.workers.fsdp_workers import RewardModelWorker + elif config.reward_model.strategy == "megatron": + from verl.workers.megatron_workers import RewardModelWorker + else: + raise NotImplementedError + role_worker_mapping[Role.RewardModel] = ray.remote(RewardModelWorker) + mapping[Role.RewardModel] = global_pool_id + + # Add a reference policy worker if KL loss or KL reward is used. + if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss: + role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker) + mapping[Role.RefPolicy] = global_pool_id + + # Load the reward manager for training and validation. + reward_fn = load_reward_manager( + config, tokenizer, num_examine=0, **config.reward_model.get("reward_kwargs", {}) + ) + val_reward_fn = load_reward_manager( + config, tokenizer, num_examine=1, **config.reward_model.get("reward_kwargs", {}) + ) + resource_pool_manager = ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=mapping) + + from verl.utils.dataset.rl_dataset import collate_fn + + # Create training and validation datasets. 
+ train_dataset = create_rl_dataset(config.data.train_files, config.data, tokenizer, processor, is_train=True) + val_dataset = create_rl_dataset(config.data.val_files, config.data, tokenizer, processor, is_train=False) + train_sampler = create_rl_sampler(config.data, train_dataset) + + # Initialize the PPO trainer. + trainer = RayPPOTrainer( + config=config, + tokenizer=tokenizer, + processor=processor, + role_worker_mapping=role_worker_mapping, + resource_pool_manager=resource_pool_manager, + ray_worker_group_cls=ray_worker_group_cls, + reward_fn=reward_fn, + val_reward_fn=val_reward_fn, + train_dataset=train_dataset, + val_dataset=val_dataset, + collate_fn=collate_fn, + train_sampler=train_sampler, + ) + # Initialize the workers of the trainer. + trainer.init_workers() + # Start the training process. + trainer.fit() + + +def create_rl_dataset(data_paths, data_config, tokenizer, processor, is_train=True): + """Create a dataset. + + Arguments: + data_paths: List of paths to data files. + data_config: The data config. + tokenizer (Tokenizer): The tokenizer. + processor (Processor): The processor. + + Returns: + dataset (Dataset): The dataset. 
+ """ + from torch.utils.data import Dataset + + from verl.utils.dataset.rl_dataset import RLHFDataset + + # Check if a custom dataset class is specified in the data configuration + # and if the path to the custom class is provided + if "custom_cls" in data_config and data_config.custom_cls.get("path", None) is not None: + # Dynamically load the custom dataset class + dataset_cls = load_extern_type(data_config.custom_cls.path, data_config.custom_cls.name) + # Verify that the custom dataset class inherits from torch.utils.data.Dataset + if not issubclass(dataset_cls, Dataset): + raise TypeError( + f"The custom dataset class '{data_config.custom_cls.name}' from " + f"'{data_config.custom_cls.path}' must inherit from torch.utils.data.Dataset" + ) + elif "datagen" in data_config and data_config.datagen.get("path", None) is not None and is_train: + # If a data generation strategy is specified, use the DynamicGenDataset class + from verl.utils.dataset.dynamicgen_dataset import DynamicGenDataset + + dataset_cls = DynamicGenDataset + print("Using DynamicGenDataset for data generation.") + + else: + # Use the default RLHFDataset class if no custom class is specified + dataset_cls = RLHFDataset + print(f"Using dataset class: {dataset_cls.__name__}") + + # Instantiate the dataset using the determined dataset class + dataset = dataset_cls( + data_files=data_paths, + tokenizer=tokenizer, + processor=processor, + config=data_config, + ) + + return dataset + + +def create_rl_sampler(data_config, dataset): + """Create a sampler for the dataset. + + Arguments: + data_config: The data config. + dataset (Dataset): The dataset. + + Returns: + sampler (Sampler): The sampler. 
+ """ + import torch + from torch.utils.data import RandomSampler, SequentialSampler + + if data_config.sampler is not None and data_config.sampler.get("class_path", None) is not None: + curriculum_class = load_extern_type( + data_config.sampler.class_path, + data_config.sampler.class_name, + ) + sampler = curriculum_class( + data_source=dataset, + data_config=data_config, + ) + assert isinstance(sampler, AbstractSampler) + assert data_config.get("dataloader_num_workers", 8) == 0, ( + "If using curriculum, num_workers must be 0 to prevent data caching. " + "If the dataloader caches data before the batch is done the " + "curriculum sampler won't have the opportunity to reorder it. " + ) + + # Use a sampler to facilitate checkpoint resumption. + # If shuffling is enabled in the data configuration, create a random sampler. + elif data_config.shuffle: + train_dataloader_generator = torch.Generator() + train_dataloader_generator.manual_seed(data_config.get("seed", 1)) + sampler = RandomSampler(data_source=dataset, generator=train_dataloader_generator) + else: + # If shuffling is disabled, use a sequential sampler to iterate through the dataset in order. + sampler = SequentialSampler(data_source=dataset) + + return sampler + + +if __name__ == "__main__": + main() diff --git a/recipe/fully_async_policy/ray_trainer.py b/recipe/fully_async_policy/ray_trainer.py new file mode 100644 index 00000000000..56a1e5bcab1 --- /dev/null +++ b/recipe/fully_async_policy/ray_trainer.py @@ -0,0 +1,1434 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# Copyright 2023-2024 SGLang Team +# Copyright 2025 ModelBest Inc. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +PPO Trainer with Ray-based single controller. +This trainer supports model-agnostic model initialization with huggingface +""" + +import json +import os +import uuid +import warnings +from collections import defaultdict +from copy import deepcopy +from dataclasses import dataclass, field +from enum import Enum +from pprint import pprint +from typing import Optional + +import numpy as np +import ray +import torch +from omegaconf import OmegaConf, open_dict +from torch.utils.data import Dataset, Sampler +from torchdata.stateful_dataloader import StatefulDataLoader +from tqdm import tqdm + +from verl import DataProto +from verl.experimental.dataset.sampler import AbstractCurriculumSampler +from verl.protocol import pad_dataproto_to_divisor, unpad_dataproto +from verl.single_controller.base import Worker +from verl.single_controller.ray import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup +from verl.single_controller.ray.base import create_colocated_worker_cls +from verl.trainer.config import AlgoConfig +from verl.trainer.ppo import core_algos +from verl.trainer.ppo.core_algos import AdvantageEstimator, agg_loss +from verl.trainer.ppo.metric_utils import ( + compute_data_metrics, + compute_throughout_metrics, + compute_timing_metrics, + process_validation_metrics, +) +from verl.trainer.ppo.reward import compute_reward, compute_reward_async +from verl.utils.checkpoint.checkpoint_manager import find_latest_ckpt_path, should_save_ckpt_esi +from verl.utils.config import omega_conf_to_dataclass +from verl.utils.debug import marked_timer +from
verl.utils.metric import ( + reduce_metrics, +) +from verl.utils.seqlen_balancing import get_seqlen_balanced_partitions, log_seqlen_unbalance +from verl.utils.torch_functional import masked_mean +from verl.utils.tracking import ValidationGenerationsLogger + +WorkerType = type[Worker] + + +class Role(Enum): + """ + To create more roles dynamically, you can subclass Role and add new members + """ + + Actor = 0 + Rollout = 1 + ActorRollout = 2 + Critic = 3 + RefPolicy = 4 + RewardModel = 5 + ActorRolloutRef = 6 + + def __str__(self): + """返回与代码中一致的字符串表示""" + return self._get_role_string() + + def _get_role_string(self): + """获取角色对应的字符串名称""" + role_mapping = { + Role.Actor: "actor", + Role.Rollout: "rollout", + Role.ActorRollout: "actor_rollout", + Role.Critic: "critic", + Role.RefPolicy: "ref", + Role.RewardModel: "rm", + Role.ActorRolloutRef: "actor_rollout_ref", + } + return role_mapping.get(self, self.name.lower()) + + @classmethod + def from_string(cls, name: str): + """从字符串创建Role实例""" + string_mapping = { + "actor": cls.Actor, + "rollout": cls.Rollout, + "actor_rollout": cls.ActorRollout, + "critic": cls.Critic, + "ref": cls.RefPolicy, + "rm": cls.RewardModel, + "actor_rollout_ref": cls.ActorRolloutRef, + } + role = string_mapping.get(name.lower()) + if role is None: + raise ValueError(f"No Role found for string: {name}") + return role + + +@dataclass +class ResourcePoolManager: + """ + Define a resource pool specification. Resource pool will be initialized first. + """ + + resource_pool_spec: dict[str, list[int]] + mapping: dict[Role, str] + resource_pool_dict: dict[str, RayResourcePool] = field(default_factory=dict) + + def create_resource_pool(self): + """Create Ray resource pools for distributed training. + + Initializes resource pools based on the resource pool specification, + with each pool managing GPU resources across multiple nodes. + For FSDP backend, uses max_colocate_count=1 to merge WorkerGroups. 
+ For Megatron backend, uses max_colocate_count>1 for different models. + """ + for resource_pool_name, process_on_nodes in self.resource_pool_spec.items(): + # max_colocate_count means the number of WorkerGroups (i.e. processes) in each RayResourcePool + # For FSDP backend, we recommend using max_colocate_count=1, which merges all WorkerGroups into one. + # For Megatron backend, we recommend using max_colocate_count>1 + # so that different models can use different WorkerGroups + resource_pool = RayResourcePool( + process_on_nodes=process_on_nodes, use_gpu=True, max_colocate_count=1, name_prefix=resource_pool_name + ) + self.resource_pool_dict[resource_pool_name] = resource_pool + + self._check_resource_available() + + def get_resource_pool(self, role: Role) -> RayResourcePool: + """Get the resource pool of the worker_cls""" + return self.resource_pool_dict[self.mapping[role]] + + def get_n_gpus(self) -> int: + """Get the number of gpus in this cluster.""" + return sum([n_gpus for process_on_nodes in self.resource_pool_spec.values() for n_gpus in process_on_nodes]) + + def _check_resource_available(self): + """Check if the resource pool can be satisfied in this ray cluster.""" + node_available_resources = ray.state.available_resources_per_node() + node_available_gpus = { + node: node_info.get("GPU", 0) if "GPU" in node_info else node_info.get("NPU", 0) + for node, node_info in node_available_resources.items() + } + + # check total required gpus can be satisfied + total_available_gpus = sum(node_available_gpus.values()) + total_required_gpus = sum( + [n_gpus for process_on_nodes in self.resource_pool_spec.values() for n_gpus in process_on_nodes] + ) + if total_available_gpus < total_required_gpus: + raise ValueError( + f"Total available GPUs {total_available_gpus} is less than total desired GPUs {total_required_gpus}" + ) + + # check each resource pool can be satisfied, O(#resource_pools * #nodes) + for resource_pool_name, process_on_nodes in
self.resource_pool_spec.items(): + num_gpus, num_nodes = process_on_nodes[0], len(process_on_nodes) + for node, available_gpus in node_available_gpus.items(): + if available_gpus >= num_gpus: + node_available_gpus[node] -= num_gpus + num_nodes -= 1 + if num_nodes == 0: + break + if num_nodes > 0: + raise ValueError( + f"Resource pool {resource_pool_name}: {num_gpus}*{num_nodes}" + + " cannot be satisfied in this ray cluster" + ) + + +def apply_kl_penalty(data: DataProto, kl_ctrl: core_algos.AdaptiveKLController, kl_penalty="kl"): + """Apply KL penalty to the token-level rewards. + + This function computes the KL divergence between the reference policy and current policy, + then applies a penalty to the token-level rewards based on this divergence. + + Args: + data (DataProto): The data containing batched model outputs and inputs. + kl_ctrl (core_algos.AdaptiveKLController): Controller for adaptive KL penalty. + kl_penalty (str, optional): Type of KL penalty to apply. Defaults to "kl". + multi_turn (bool, optional): Whether the data is from a multi-turn conversation. Defaults to False. + + Returns: + tuple: A tuple containing: + - The updated data with token-level rewards adjusted by KL penalty + - A dictionary of metrics related to the KL penalty + """ + response_mask = data.batch["response_mask"] + token_level_scores = data.batch["token_level_scores"] + batch_size = data.batch.batch_size[0] + + # compute kl between ref_policy and current policy + # When apply_kl_penalty, algorithm.use_kl_in_reward=True, so the reference model has been enabled.
+ kld = core_algos.kl_penalty( + data.batch["old_log_probs"], data.batch["ref_log_prob"], kl_penalty=kl_penalty + ) # (batch_size, response_length) + kld = kld * response_mask + beta = kl_ctrl.value + + token_level_rewards = token_level_scores - beta * kld + + current_kl = masked_mean(kld, mask=response_mask, axis=-1) # average over sequence + current_kl = torch.mean(current_kl, dim=0).item() + + # according to https://github.com/huggingface/trl/blob/951ca1841f29114b969b57b26c7d3e80a39f75a0/trl/trainer/ppo_trainer.py#L837 + kl_ctrl.update(current_kl=current_kl, n_steps=batch_size) + data.batch["token_level_rewards"] = token_level_rewards + + metrics = {"actor/reward_kl_penalty": current_kl, "actor/reward_kl_penalty_coeff": beta} + + return data, metrics + + +def compute_response_mask(data: DataProto): + """Compute the attention mask for the response part of the sequence. + + This function extracts the portion of the attention mask that corresponds to the model's response, + which is used for masking computations that should only apply to response tokens. + + Args: + data (DataProto): The data containing batched model outputs and inputs. + + Returns: + torch.Tensor: The attention mask for the response tokens. + """ + responses = data.batch["responses"] + response_length = responses.size(1) + attention_mask = data.batch["attention_mask"] + return attention_mask[:, -response_length:] + + +def compute_advantage( + data: DataProto, + adv_estimator: AdvantageEstimator, + gamma: float = 1.0, + lam: float = 1.0, + num_repeat: int = 1, + norm_adv_by_std_in_grpo: bool = True, + config: Optional[AlgoConfig] = None, +) -> DataProto: + """Compute advantage estimates for policy optimization. + + This function computes advantage estimates using various estimators like GAE, GRPO, REINFORCE++, etc. + The advantage estimates are used to guide policy optimization in RL algorithms. + + Args: + data (DataProto): The data containing batched model outputs and inputs. 
+ adv_estimator (AdvantageEstimator): The advantage estimator to use (e.g., GAE, GRPO, REINFORCE++). + gamma (float, optional): Discount factor for future rewards. Defaults to 1.0. + lam (float, optional): Lambda parameter for GAE. Defaults to 1.0. + num_repeat (int, optional): Number of times to repeat the computation. Defaults to 1. + norm_adv_by_std_in_grpo (bool, optional): Whether to normalize advantages by standard deviation in + GRPO. Defaults to True. + config (dict, optional): Configuration dictionary for algorithm settings. Defaults to None. + + Returns: + DataProto: The updated data with computed advantages and returns. + """ + # Back-compatible with trainers that do not compute response mask in fit + if "response_mask" not in data.batch.keys(): + data.batch["response_mask"] = compute_response_mask(data) + # prepare response group + if adv_estimator == AdvantageEstimator.GAE: + # Compute advantages and returns using Generalized Advantage Estimation (GAE) + advantages, returns = core_algos.compute_gae_advantage_return( + token_level_rewards=data.batch["token_level_rewards"], + values=data.batch["values"], + response_mask=data.batch["response_mask"], + gamma=gamma, + lam=lam, + ) + data.batch["advantages"] = advantages + data.batch["returns"] = returns + if config.get("use_pf_ppo", False): + data = core_algos.compute_pf_ppo_reweight_data( + data, + config.pf_ppo.get("reweight_method"), + config.pf_ppo.get("weight_pow"), + ) + elif adv_estimator == AdvantageEstimator.GRPO: + # Initialize the mask for GRPO calculation + grpo_calculation_mask = data.batch["response_mask"] + # Call compute_grpo_outcome_advantage with parameters matching its definition + advantages, returns = core_algos.compute_grpo_outcome_advantage( + token_level_rewards=data.batch["token_level_rewards"], + response_mask=grpo_calculation_mask, + index=data.non_tensor_batch["uid"], + norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo, + ) + data.batch["advantages"] = advantages + 
data.batch["returns"] = returns + else: + # handle all other adv estimator type other than GAE and GRPO + adv_estimator_fn = core_algos.get_adv_estimator_fn(adv_estimator) + adv_kwargs = { + "token_level_rewards": data.batch["token_level_rewards"], + "response_mask": data.batch["response_mask"], + "config": config, + } + if "uid" in data.non_tensor_batch: # optional + adv_kwargs["index"] = data.non_tensor_batch["uid"] + if "reward_baselines" in data.batch: # optional + adv_kwargs["reward_baselines"] = data.batch["reward_baselines"] + + # calculate advantage estimator + advantages, returns = adv_estimator_fn(**adv_kwargs) + data.batch["advantages"] = advantages + data.batch["returns"] = returns + return data + + +class RayPPOTrainer: + """Distributed PPO trainer using Ray for scalable reinforcement learning. + + This trainer orchestrates distributed PPO training across multiple nodes and GPUs, + managing actor rollouts, critic training, and reward computation with Ray backend. + Supports various model architectures including FSDP, Megatron, and vLLM integration. + """ + + # TODO: support each role have individual ray_worker_group_cls, + # i.e., support different backend of different role + def __init__( + self, + config, + tokenizer, + role_worker_mapping: dict[Role, WorkerType], + resource_pool_manager: ResourcePoolManager, + ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, + processor=None, + reward_fn=None, + val_reward_fn=None, + train_dataset: Optional[Dataset] = None, + val_dataset: Optional[Dataset] = None, + collate_fn=None, + train_sampler: Optional[Sampler] = None, + device_name=None, + ): + """ + Initialize distributed PPO trainer with Ray backend. + Note that this trainer runs on the driver process on a single CPU/GPU node. + + Args: + config: Configuration object containing training parameters. + tokenizer: Tokenizer used for encoding and decoding text. + role_worker_mapping (dict[Role, WorkerType]): Mapping from roles to worker classes. 
+ resource_pool_manager (ResourcePoolManager): Manager for Ray resource pools. + ray_worker_group_cls (RayWorkerGroup, optional): Class for Ray worker groups. Defaults to RayWorkerGroup. + processor: Optional data processor, used for multimodal data + reward_fn: Function for computing rewards during training. + val_reward_fn: Function for computing rewards during validation. + train_dataset (Optional[Dataset], optional): Training dataset. Defaults to None. + val_dataset (Optional[Dataset], optional): Validation dataset. Defaults to None. + collate_fn: Function to collate data samples into batches. + train_sampler (Optional[Sampler], optional): Sampler for the training dataset. Defaults to None. + device_name (str, optional): Device name for training (e.g., "cuda", "cpu"). Defaults to None. + """ + + # Store the tokenizer for text processing + self.tokenizer = tokenizer + self.processor = processor + self.config = config + self.reward_fn = reward_fn + self.val_reward_fn = val_reward_fn + + self.hybrid_engine = config.actor_rollout_ref.hybrid_engine + assert self.hybrid_engine, "Currently, only support hybrid engine" + + if self.hybrid_engine: + assert Role.ActorRollout in role_worker_mapping, f"{role_worker_mapping.keys()=}" + + self.role_worker_mapping = role_worker_mapping + self.resource_pool_manager = resource_pool_manager + self.use_reference_policy = Role.RefPolicy in role_worker_mapping + self.use_rm = Role.RewardModel in role_worker_mapping + self.ray_worker_group_cls = ray_worker_group_cls + self.device_name = device_name if device_name else self.config.trainer.device + self.validation_generations_logger = ValidationGenerationsLogger( + project_name=self.config.trainer.project_name, + experiment_name=self.config.trainer.experiment_name, + ) + + # if ref_in_actor is True, the reference policy will be actor without lora applied + self.ref_in_actor = config.actor_rollout_ref.model.get("lora_rank", 0) > 0 + + # define in-reward KL control + # kl loss control 
currently not suppoorted + if self.config.algorithm.use_kl_in_reward: + self.kl_ctrl_in_reward = core_algos.get_kl_controller(self.config.algorithm.kl_ctrl) + + if config.critic.enable is not None: + self.use_critic = bool(config.critic.enable) + elif self.config.algorithm.adv_estimator == AdvantageEstimator.GAE: + self.use_critic = True + else: + warnings.warn( + "Disabled critic as algorithm.adv_estimator != gae. " + "If it is not intended, please set critic.enable=True", + stacklevel=2, + ) + self.use_critic = False + + self._validate_config() + self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler) + + def _validate_config(self): + config = self.config + # number of GPUs total + n_gpus = config.trainer.n_gpus_per_node * config.trainer.nnodes + if config.actor_rollout_ref.actor.strategy == "megatron": + model_parallel_size = ( + config.actor_rollout_ref.actor.megatron.tensor_model_parallel_size + * config.actor_rollout_ref.actor.megatron.pipeline_model_parallel_size + ) + assert ( + n_gpus % (model_parallel_size * config.actor_rollout_ref.actor.megatron.context_parallel_size) == 0 + ), ( + f"n_gpus ({n_gpus}) must be divisible by model_parallel_size ({model_parallel_size}) times " + f"context_parallel_size ({config.actor_rollout_ref.actor.megatron.context_parallel_size})" + ) + megatron_dp = n_gpus // ( + model_parallel_size * config.actor_rollout_ref.actor.megatron.context_parallel_size + ) + self.minimal_bsz = megatron_dp * config.actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu + else: + self.minimal_bsz = n_gpus + + # 1. 
Check total batch size for data correctness + real_train_batch_size = config.data.train_batch_size * config.actor_rollout_ref.rollout.n + assert real_train_batch_size % self.minimal_bsz == 0, ( + f"real_train_batch_size ({real_train_batch_size}) must be divisible by minimal possible batch size " + f"({self.minimal_bsz})" + ) + + # A helper function to check "micro_batch_size" vs "micro_batch_size_per_gpu" + # We throw an error if the user sets both. The new convention is "..._micro_batch_size_per_gpu". + def check_mutually_exclusive(mbs, mbs_per_gpu, name: str): + """Validate mutually exclusive micro batch size configuration options. + + Ensures that users don't set both deprecated micro_batch_size and + the new micro_batch_size_per_gpu parameters simultaneously. + + Args: + mbs: Deprecated micro batch size parameter value. + mbs_per_gpu: New micro batch size per GPU parameter value. + name (str): Configuration section name for error messages. + + Raises: + ValueError: If both parameters are set or neither is set. + """ + settings = { + "reward_model": "micro_batch_size", + "actor_rollout_ref.ref": "log_prob_micro_batch_size", + "actor_rollout_ref.rollout": "log_prob_micro_batch_size", + } + + if name in settings: + param = settings[name] + param_per_gpu = f"{param}_per_gpu" + + if mbs is None and mbs_per_gpu is None: + raise ValueError( + f"[{name}] Please set at least one of '{name}.{param}' or '{name}.{param_per_gpu}'." + ) + + if mbs is not None and mbs_per_gpu is not None: + raise ValueError( + f"[{name}] You have set both '{name}.{param}' AND '{name}.{param_per_gpu}'. Please remove " + f"'{name}.{param}' because only '*_{param_per_gpu}' is supported (the former is deprecated)." 
+ ) + + # Actor validation done in ActorConfig.__post_init__ and validate() + actor_config = omega_conf_to_dataclass(config.actor_rollout_ref.actor) + actor_config.validate(n_gpus, config.data.train_batch_size, config.actor_rollout_ref.model) + + if not config.actor_rollout_ref.actor.use_dynamic_bsz: + if self.use_reference_policy: + # reference: log_prob_micro_batch_size vs. log_prob_micro_batch_size_per_gpu + check_mutually_exclusive( + config.actor_rollout_ref.ref.log_prob_micro_batch_size, + config.actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu, + "actor_rollout_ref.ref", + ) + + # The rollout section also has log_prob_micro_batch_size vs. log_prob_micro_batch_size_per_gpu + check_mutually_exclusive( + config.actor_rollout_ref.rollout.log_prob_micro_batch_size, + config.actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu, + "actor_rollout_ref.rollout", + ) + + # Check for reward model micro-batch size conflicts + if config.reward_model.enable and not config.reward_model.use_dynamic_bsz: + check_mutually_exclusive( + config.reward_model.micro_batch_size, config.reward_model.micro_batch_size_per_gpu, "reward_model" + ) + + if self.config.algorithm.use_kl_in_reward and config.actor_rollout_ref.actor.use_kl_loss: + print("NOTICE: You have both enabled in-reward kl and kl loss.") + + # critic + if self.use_critic: + critic_config = omega_conf_to_dataclass(config.critic) + critic_config.validate(n_gpus, config.data.train_batch_size) + + if config.data.get("val_batch_size", None) is not None: + print( + "WARNING: val_batch_size is deprecated." + + " Validation datasets are sent to inference engines as a whole batch," + + " which will schedule the memory themselves." 
+ ) + + # check eval config + if config.actor_rollout_ref.rollout.val_kwargs.do_sample: + assert config.actor_rollout_ref.rollout.temperature > 0, ( + "validation gen temperature should be greater than 0 when enabling do_sample" + ) + + print("[validate_config] All configuration checks passed successfully!") + + def _create_dataloader(self, train_dataset, val_dataset, collate_fn, train_sampler: Optional[Sampler]): + """ + Creates the train and validation dataloaders. + """ + # TODO: we have to make sure the batch size is divisible by the dp size + from verl.trainer.main_ppo import create_rl_dataset, create_rl_sampler + + if train_dataset is None: + train_dataset = create_rl_dataset( + self.config.data.train_files, self.config.data, self.tokenizer, self.processor + ) + if val_dataset is None: + val_dataset = create_rl_dataset( + self.config.data.val_files, self.config.data, self.tokenizer, self.processor + ) + self.train_dataset, self.val_dataset = train_dataset, val_dataset + + if train_sampler is None: + train_sampler = create_rl_sampler(self.config.data, self.train_dataset) + if collate_fn is None: + from verl.utils.dataset.rl_dataset import collate_fn as default_collate_fn + + collate_fn = default_collate_fn + + num_workers = self.config.data["dataloader_num_workers"] + + self.train_dataloader = StatefulDataLoader( + dataset=self.train_dataset, + batch_size=self.config.data.get("gen_batch_size", self.config.data.train_batch_size), + num_workers=num_workers, + drop_last=True, + collate_fn=collate_fn, + sampler=train_sampler, + ) + + val_batch_size = self.config.data.val_batch_size # Prefer config value if set + if val_batch_size is None: + val_batch_size = len(self.val_dataset) + + self.val_dataloader = StatefulDataLoader( + dataset=self.val_dataset, + batch_size=val_batch_size, + num_workers=num_workers, + shuffle=self.config.data.get("validation_shuffle", True), + drop_last=False, + collate_fn=collate_fn, + ) + + assert len(self.train_dataloader) >= 1, "Train 
dataloader is empty!" + assert len(self.val_dataloader) >= 1, "Validation dataloader is empty!" + + print( + f"Size of train dataloader: {len(self.train_dataloader)}, Size of val dataloader: " + f"{len(self.val_dataloader)}" + ) + + total_training_steps = len(self.train_dataloader) * self.config.trainer.total_epochs + + if self.config.trainer.total_training_steps is not None: + total_training_steps = self.config.trainer.total_training_steps + + self.total_training_steps = total_training_steps + print(f"Total training steps: {self.total_training_steps}") + + try: + OmegaConf.set_struct(self.config, True) + with open_dict(self.config): + if OmegaConf.select(self.config, "actor_rollout_ref.actor.optim"): + self.config.actor_rollout_ref.actor.optim.total_training_steps = total_training_steps + if OmegaConf.select(self.config, "critic.optim"): + self.config.critic.optim.total_training_steps = total_training_steps + except Exception as e: + print(f"Warning: Could not set total_training_steps in config. Structure missing? 
Error: {e}") + + def _dump_generations(self, inputs, outputs, scores, reward_extra_infos_dict, dump_path): + """Dump rollout/validation samples as JSONL.""" + os.makedirs(dump_path, exist_ok=True) + filename = os.path.join(dump_path, f"{self.global_steps}.jsonl") + + n = len(inputs) + base_data = { + "input": inputs, + "output": outputs, + "score": scores, + "step": [self.global_steps] * n, + } + + for k, v in reward_extra_infos_dict.items(): + if len(v) == n: + base_data[k] = v + + lines = [] + for i in range(n): + entry = {k: v[i] for k, v in base_data.items()} + lines.append(json.dumps(entry, ensure_ascii=False)) + + with open(filename, "w") as f: + f.write("\n".join(lines) + "\n") + + print(f"Dumped generations to {filename}") + + def _maybe_log_val_generations(self, inputs, outputs, scores): + """Log a table of validation samples to the configured logger (wandb or swanlab)""" + + generations_to_log = self.config.trainer.log_val_generations + + if generations_to_log == 0: + return + + import numpy as np + + # Create tuples of (input, output, score) and sort by input text + samples = list(zip(inputs, outputs, scores, strict=True)) + samples.sort(key=lambda x: x[0]) # Sort by input text + + # Use fixed random seed for deterministic shuffling + rng = np.random.RandomState(42) + rng.shuffle(samples) + + # Take first N samples after shuffling + samples = samples[:generations_to_log] + + # Log to each configured logger + self.validation_generations_logger.log(self.config.trainer.logger, samples, self.global_steps) + + def _validate(self): + data_source_lst = [] + reward_extra_infos_dict: dict[str, list] = defaultdict(list) + + # Lists to collect samples for the table + sample_inputs = [] + sample_outputs = [] + sample_scores = [] + sample_turns = [] + + for test_data in self.val_dataloader: + test_batch = DataProto.from_single_dict(test_data) + + # repeat test batch + test_batch = test_batch.repeat( + repeat_times=self.config.actor_rollout_ref.rollout.val_kwargs.n, 
interleave=True + ) + + # we only do validation on rule-based rm + if self.config.reward_model.enable and test_batch[0].non_tensor_batch["reward_model"]["style"] == "model": + return {} + + # Store original inputs + input_ids = test_batch.batch["input_ids"] + # TODO: Can we keep special tokens except for padding tokens? + input_texts = [self.tokenizer.decode(ids, skip_special_tokens=True) for ids in input_ids] + sample_inputs.extend(input_texts) + + batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"] + non_tensor_batch_keys_to_pop = ["raw_prompt_ids"] + if "multi_modal_data" in test_batch.non_tensor_batch: + non_tensor_batch_keys_to_pop.append("multi_modal_data") + if "raw_prompt" in test_batch.non_tensor_batch: + non_tensor_batch_keys_to_pop.append("raw_prompt") + if "tools_kwargs" in test_batch.non_tensor_batch: + non_tensor_batch_keys_to_pop.append("tools_kwargs") + if "interaction_kwargs" in test_batch.non_tensor_batch: + non_tensor_batch_keys_to_pop.append("interaction_kwargs") + if "agent_name" in test_batch.non_tensor_batch: + non_tensor_batch_keys_to_pop.append("agent_name") + test_gen_batch = test_batch.pop( + batch_keys=batch_keys_to_pop, + non_tensor_batch_keys=non_tensor_batch_keys_to_pop, + ) + + test_gen_batch.meta_info = { + "eos_token_id": self.tokenizer.eos_token_id, + "pad_token_id": self.tokenizer.pad_token_id, + "recompute_log_prob": False, + "do_sample": self.config.actor_rollout_ref.rollout.val_kwargs.do_sample, + "validate": True, + "global_steps": self.global_steps, + } + print(f"test_gen_batch meta info: {test_gen_batch.meta_info}") + + # pad to be divisible by dp_size + size_divisor = ( + self.actor_rollout_wg.world_size + if not self.async_rollout_mode + else self.config.actor_rollout_ref.rollout.agent.num_workers + ) + test_gen_batch_padded, pad_size = pad_dataproto_to_divisor(test_gen_batch, size_divisor) + if not self.async_rollout_mode: + test_output_gen_batch_padded = 
self.actor_rollout_wg.generate_sequences(test_gen_batch_padded) + else: + test_output_gen_batch_padded = self.async_rollout_manager.generate_sequences(test_gen_batch_padded) + + # unpad + test_output_gen_batch = unpad_dataproto(test_output_gen_batch_padded, pad_size=pad_size) + + print("validation generation end") + + # Store generated outputs + output_ids = test_output_gen_batch.batch["responses"] + output_texts = [self.tokenizer.decode(ids, skip_special_tokens=True) for ids in output_ids] + sample_outputs.extend(output_texts) + + test_batch = test_batch.union(test_output_gen_batch) + test_batch.meta_info["validate"] = True + + # evaluate using reward_function + result = self.val_reward_fn(test_batch, return_dict=True) + reward_tensor = result["reward_tensor"] + scores = reward_tensor.sum(-1).cpu().tolist() + sample_scores.extend(scores) + + reward_extra_infos_dict["reward"].extend(scores) + print(f"len reward_extra_infos_dict['reward']: {len(reward_extra_infos_dict['reward'])}") + if "reward_extra_info" in result: + for key, lst in result["reward_extra_info"].items(): + reward_extra_infos_dict[key].extend(lst) + print(f"len reward_extra_infos_dict['{key}']: {len(reward_extra_infos_dict[key])}") + + # collect num_turns of each prompt + if "__num_turns__" in test_batch.non_tensor_batch: + sample_turns.append(test_batch.non_tensor_batch["__num_turns__"]) + + data_source_lst.append(test_batch.non_tensor_batch.get("data_source", ["unknown"] * reward_tensor.shape[0])) + + self._maybe_log_val_generations(inputs=sample_inputs, outputs=sample_outputs, scores=sample_scores) + + # dump generations + val_data_dir = self.config.trainer.get("validation_data_dir", None) + if val_data_dir: + self._dump_generations( + inputs=sample_inputs, + outputs=sample_outputs, + scores=sample_scores, + reward_extra_infos_dict=reward_extra_infos_dict, + dump_path=val_data_dir, + ) + + for key_info, lst in reward_extra_infos_dict.items(): + assert len(lst) == 0 or len(lst) == 
len(sample_scores), f"{key_info}: {len(lst)=}, {len(sample_scores)=}" + + data_sources = np.concatenate(data_source_lst, axis=0) + + data_src2var2metric2val = process_validation_metrics(data_sources, sample_inputs, reward_extra_infos_dict) + metric_dict = {} + for data_source, var2metric2val in data_src2var2metric2val.items(): + core_var = "acc" if "acc" in var2metric2val else "reward" + for var_name, metric2val in var2metric2val.items(): + n_max = max([int(name.split("@")[-1].split("/")[0]) for name in metric2val.keys()]) + for metric_name, metric_val in metric2val.items(): + if ( + (var_name == core_var) + and any(metric_name.startswith(pfx) for pfx in ["mean", "maj", "best"]) + and (f"@{n_max}" in metric_name) + ): + metric_sec = "val-core" + else: + metric_sec = "val-aux" + pfx = f"{metric_sec}/{data_source}/{var_name}/{metric_name}" + metric_dict[pfx] = metric_val + + if len(sample_turns) > 0: + sample_turns = np.concatenate(sample_turns) + metric_dict["val-aux/num_turns/min"] = sample_turns.min() + metric_dict["val-aux/num_turns/max"] = sample_turns.max() + metric_dict["val-aux/num_turns/mean"] = sample_turns.mean() + + return metric_dict + + def init_workers(self): + """Initialize distributed training workers using Ray backend. + + Creates: + 1. Ray resource pools from configuration + 2. Worker groups for each role (actor, critic, etc.) 
def _init_resource_pools(self):
    """Create the resource pools and start an empty role-to-class table for each one."""
    self.resource_pool_manager.create_resource_pool()
    pools = self.resource_pool_manager.resource_pool_dict.values()
    self.resource_pool_to_cls = {pool: {} for pool in pools}


def _create_worker_classes(self):
    """Register the worker class of every role, in a fixed order."""
    for register in (
        self._create_actor_rollout_classes,
        self._create_critic_class,
        self._create_reference_policy_class,
        self._create_reward_model_class,
    ):
        register()


def _create_actor_rollout_classes(self):
    """Register the actor/rollout worker class; only the hybrid engine is implemented."""
    if not self.hybrid_engine:
        raise NotImplementedError

    pool = self.resource_pool_manager.get_resource_pool(Role.ActorRollout)
    role_key = str(Role.ActorRollout)
    self.resource_pool_to_cls[pool][role_key] = RayClassWithInitArgs(
        cls=self.role_worker_mapping[Role.ActorRollout],
        config=self.config.actor_rollout_ref,
        role=role_key,
        profile_option=self.config.trainer.npu_profile.options,
    )


def _create_critic_class(self):
    """Register the critic worker class when a critic is in use."""
    if not self.use_critic:
        return

    pool = self.resource_pool_manager.get_resource_pool(Role.Critic)
    critic_config = omega_conf_to_dataclass(self.config.critic)
    self.resource_pool_to_cls[pool][str(Role.Critic)] = RayClassWithInitArgs(
        cls=self.role_worker_mapping[Role.Critic], config=critic_config
    )


def _create_reference_policy_class(self):
    """Register the reference-policy worker class when a reference policy is in use."""
    if not self.use_reference_policy:
        return

    pool = self.resource_pool_manager.get_resource_pool(Role.RefPolicy)
    role_key = str(Role.RefPolicy)
    self.resource_pool_to_cls[pool][role_key] = RayClassWithInitArgs(
        self.role_worker_mapping[Role.RefPolicy],
        config=self.config.actor_rollout_ref,
        role=role_key,
        profile_option=self.config.trainer.npu_profile.options,
    )


def _create_reward_model_class(self):
    """Register the reward-model worker class when ``self.use_rm`` is set."""
    if not self.use_rm:
        return

    pool = self.resource_pool_manager.get_resource_pool(Role.RewardModel)
    self.resource_pool_to_cls[pool][str(Role.RewardModel)] = RayClassWithInitArgs(
        self.role_worker_mapping[Role.RewardModel], config=self.config.reward_model
    )


def _init_worker_groups(self):
    """Spawn one colocated worker group per resource pool and collect them in ``self.all_wg``.

    NOTE: if you want to use a different resource pool for each role, which can support
    different parallel size, you should not use `create_colocated_worker_cls`. Instead,
    directly pass different resource pool to different worker groups.
    See https://github.com/volcengine/verl/blob/master/examples/ray/tutorial.ipynb for more information.
    """
    trainer_cfg = self.config.trainer

    # Keyword arguments shared by every RayWorkerGroup created below.
    wg_kwargs = {}
    if OmegaConf.select(trainer_cfg, "ray_wait_register_center_timeout") is not None:
        wg_kwargs["ray_wait_register_center_timeout"] = trainer_cfg.ray_wait_register_center_timeout

    profile_steps = OmegaConf.select(trainer_cfg, "profile_steps")
    if profile_steps is not None:
        wg_kwargs["profile_steps"] = profile_steps
        nsight_options = OmegaConf.select(trainer_cfg, "worker_nsight_options")
        assert nsight_options is not None, (
            "worker_nsight_options must be set when profile_steps is set"
        )
        wg_kwargs["worker_nsight_options"] = OmegaConf.to_container(nsight_options)
    wg_kwargs["device_name"] = self.device_name

    spawned = {}
    for resource_pool, class_dict in self.resource_pool_to_cls.items():
        colocated_cls = create_colocated_worker_cls(class_dict=class_dict)
        group = self.ray_worker_group_cls(
            resource_pool=resource_pool,
            ray_cls_with_init=colocated_cls,
            **wg_kwargs,
        )
        spawned.update(group.spawn(prefix_set=class_dict.keys()))
    self.all_wg = spawned
def _init_async_rollout_manager(self):
    """Create the async rollout manager when ``rollout.mode`` is "async".

    Always sets ``self.async_rollout_mode``; ``self.async_rollout_manager`` is only
    created in async mode.
    """
    # create async rollout manager and request scheduler
    self.async_rollout_mode = False
    if self.config.actor_rollout_ref.rollout.mode == "async":
        from verl.experimental.agent_loop import AgentLoopManager

        self.async_rollout_mode = True
        self.async_rollout_manager = AgentLoopManager(
            config=self.config,
            worker_group=self.actor_rollout_wg,
        )


def _save_checkpoint(self):
    """Save actor (and critic) checkpoints, the dataloader state and the latest-step marker.

    Everything is written below ``<default_local_dir>/global_step_<global_steps>``;
    remote paths are derived from ``default_hdfs_dir`` when it is set.
    """
    from verl.utils.fs import local_mkdir_safe

    # path: given_path + `/global_step_{global_steps}` + `/actor`
    local_global_step_folder = os.path.join(
        self.config.trainer.default_local_dir, f"global_step_{self.global_steps}"
    )

    print(f"local_global_step_folder: {local_global_step_folder}")
    actor_local_path = os.path.join(local_global_step_folder, "actor")

    actor_remote_path = (
        None
        if self.config.trainer.default_hdfs_dir is None
        else os.path.join(self.config.trainer.default_hdfs_dir, f"global_step_{self.global_steps}", "actor")
    )

    remove_previous_ckpt_in_save = self.config.trainer.get("remove_previous_ckpt_in_save", False)
    if remove_previous_ckpt_in_save:
        print(
            "Warning: remove_previous_ckpt_in_save is deprecated,"
            + " set max_actor_ckpt_to_keep=1 and max_critic_ckpt_to_keep=1 instead"
        )
    # The deprecated flag is honoured by keeping a single checkpoint.
    max_actor_ckpt_to_keep = (
        self.config.trainer.get("max_actor_ckpt_to_keep", None) if not remove_previous_ckpt_in_save else 1
    )
    max_critic_ckpt_to_keep = (
        self.config.trainer.get("max_critic_ckpt_to_keep", None) if not remove_previous_ckpt_in_save else 1
    )

    self.actor_rollout_wg.save_checkpoint(
        actor_local_path, actor_remote_path, self.global_steps, max_ckpt_to_keep=max_actor_ckpt_to_keep
    )

    if self.use_critic:
        critic_local_path = os.path.join(local_global_step_folder, "critic")
        critic_remote_path = (
            None
            if self.config.trainer.default_hdfs_dir is None
            else os.path.join(self.config.trainer.default_hdfs_dir, f"global_step_{self.global_steps}", "critic")
        )
        self.critic_wg.save_checkpoint(
            critic_local_path, critic_remote_path, self.global_steps, max_ckpt_to_keep=max_critic_ckpt_to_keep
        )

    # save dataloader
    local_mkdir_safe(local_global_step_folder)
    dataloader_local_path = os.path.join(local_global_step_folder, "data.pt")
    dataloader_state_dict = self.train_dataloader.state_dict()
    torch.save(dataloader_state_dict, dataloader_local_path)

    # latest checkpointed iteration tracker (for atomic usage)
    local_latest_checkpointed_iteration = os.path.join(
        self.config.trainer.default_local_dir, "latest_checkpointed_iteration.txt"
    )
    with open(local_latest_checkpointed_iteration, "w") as f:
        f.write(str(self.global_steps))


def _load_checkpoint(self):
    """Restore actor, critic and dataloader state according to ``trainer.resume_mode``.

    Returns 0 when resuming is disabled or when "auto" mode finds no checkpoint;
    otherwise sets ``self.global_steps`` from the checkpoint folder name and loads the
    saved state (returning ``None``).

    Raises:
        NotImplementedError: If ``trainer.default_hdfs_dir`` is set.
    """
    if self.config.trainer.resume_mode == "disable":
        return 0

    # load from hdfs
    if self.config.trainer.default_hdfs_dir is not None:
        raise NotImplementedError("load from hdfs is not implemented yet")
    else:
        checkpoint_folder = self.config.trainer.default_local_dir  # TODO: check path
        if not os.path.isabs(checkpoint_folder):
            working_dir = os.getcwd()
            checkpoint_folder = os.path.join(working_dir, checkpoint_folder)
        global_step_folder = find_latest_ckpt_path(checkpoint_folder)  # None if no latest

    # find global_step_folder
    if self.config.trainer.resume_mode == "auto":
        if global_step_folder is None:
            print("Training from scratch")
            return 0
    else:
        if self.config.trainer.resume_mode == "resume_path":
            assert isinstance(self.config.trainer.resume_from_path, str), "resume ckpt must be str type"
            assert "global_step_" in self.config.trainer.resume_from_path, (
                "resume ckpt must specify the global_steps"
            )
            global_step_folder = self.config.trainer.resume_from_path
            if not os.path.isabs(global_step_folder):
                working_dir = os.getcwd()
                global_step_folder = os.path.join(working_dir, global_step_folder)
    print(f"Load from checkpoint folder: {global_step_folder}")
    # set global step
    self.global_steps = int(global_step_folder.split("global_step_")[-1])

    print(f"Setting global step to {self.global_steps}")
    print(f"Resuming from {global_step_folder}")

    actor_path = os.path.join(global_step_folder, "actor")
    critic_path = os.path.join(global_step_folder, "critic")
    # load actor
    self.actor_rollout_wg.load_checkpoint(
        actor_path, del_local_after_load=self.config.trainer.del_local_ckpt_after_load
    )
    # load critic
    if self.use_critic:
        self.critic_wg.load_checkpoint(
            critic_path, del_local_after_load=self.config.trainer.del_local_ckpt_after_load
        )

    # load dataloader,
    # TODO: from remote not implemented yet
    dataloader_local_path = os.path.join(global_step_folder, "data.pt")
    if os.path.exists(dataloader_local_path):
        # NOTE(security): weights_only=False unpickles arbitrary objects; only resume
        # from checkpoint directories that are trusted.
        dataloader_state_dict = torch.load(dataloader_local_path, weights_only=False)
        self.train_dataloader.load_state_dict(dataloader_state_dict)
    else:
        print(f"Warning: No dataloader state found at {dataloader_local_path}, will start from scratch")


def _start_profiling(self, do_profile: bool, timing_raw) -> None:
    """Start profiling for all worker groups if profiling is enabled.

    Args:
        do_profile (bool): Whether this step is profiled; nothing is started when False.
        timing_raw: Timing dictionary handed to ``marked_timer``.
    """
    with marked_timer("start_profile", timing_raw):
        if do_profile:
            self.actor_rollout_wg.start_profile(role="e2e", profile_step=self.global_steps)
            # ``ref_policy_wg`` only exists when the reference policy is a separate
            # worker group (it is not created when ``ref_in_actor`` is set).
            if self.use_reference_policy and not self.ref_in_actor:
                self.ref_policy_wg.start_profile()
            if self.use_critic:
                self.critic_wg.start_profile()
            if self.use_rm:
                self.rm_wg.start_profile()


def _stop_profiling(self, do_profile: bool, timing_raw) -> None:
    """Stop profiling for all worker groups if profiling is enabled.

    Args:
        do_profile (bool): Whether this step is profiled; nothing is stopped when False.
        timing_raw: Timing dictionary handed to ``marked_timer``.
    """
    with marked_timer("stop_profile", timing_raw):
        if do_profile:
            self.actor_rollout_wg.stop_profile()
            # Mirror _start_profiling: skip the reference policy when it lives in the actor.
            if self.use_reference_policy and not self.ref_in_actor:
                self.ref_policy_wg.stop_profile()
            if self.use_critic:
                self.critic_wg.stop_profile()
            if self.use_rm:
                self.rm_wg.stop_profile()


def _balance_batch(self, batch: DataProto, metrics, logging_prefix="global_seqlen"):
    """Reorder the data on single controller such that each dp rank gets similar total tokens.

    Args:
        batch (DataProto): Batch to reorder in place; its ``attention_mask`` row sums are
            used as the per-sample sequence lengths.
        metrics: Dictionary updated with the balance statistics.
        logging_prefix (str, optional): Prefix passed to ``log_seqlen_unbalance``.
    """
    attention_mask = batch.batch["attention_mask"]
    batch_size = attention_mask.shape[0]
    global_seqlen_lst = batch.batch["attention_mask"].view(batch_size, -1).sum(-1).tolist()  # (train_batch_size,)
    world_size = self.actor_rollout_wg.world_size
    global_partition_lst = get_seqlen_balanced_partitions(
        global_seqlen_lst, k_partitions=world_size, equal_size=True
    )
    # reorder based on index. The data will be automatically equally partitioned by dispatch function
    global_idx = torch.tensor([j for partition in global_partition_lst for j in partition])
    batch.reorder(global_idx)
    global_balance_stats = log_seqlen_unbalance(
        seqlen_list=global_seqlen_lst, partitions=global_partition_lst, prefix=logging_prefix
    )
    metrics.update(global_balance_stats)
+ """ + from omegaconf import OmegaConf + + from verl.utils.tracking import Tracking + + logger = Tracking( + project_name=self.config.trainer.project_name, + experiment_name=self.config.trainer.experiment_name, + default_backend=self.config.trainer.logger, + config=OmegaConf.to_container(self.config, resolve=True), + ) + + self.global_steps = 0 + + # load checkpoint before doing anything + self._load_checkpoint() + + # perform validation before training + # currently, we only support validation using the reward_function. + if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True): + val_metrics = self._validate() + assert val_metrics, f"{val_metrics=}" + pprint(f"Initial validation metrics: {val_metrics}") + logger.log(data=val_metrics, step=self.global_steps) + if self.config.trainer.get("val_only", False): + return + + # add tqdm + progress_bar = tqdm(total=self.total_training_steps, initial=self.global_steps, desc="Training Progress") + + # we start from step 1 + self.global_steps += 1 + last_val_metrics = None + self.max_steps_duration = 0 + + for epoch in range(self.config.trainer.total_epochs): + for batch_dict in self.train_dataloader: + metrics = {} + timing_raw = {} + + do_profile = ( + self.global_steps in self.config.trainer.profile_steps + if self.config.trainer.profile_steps is not None + else False + ) + self._start_profiling(do_profile, timing_raw) + + batch, gen_batch = self._prepare_generate_batch(batch_dict) + + is_last_step = self.global_steps >= self.total_training_steps + + with marked_timer("step", timing_raw): + # generate a batch + with marked_timer("gen", timing_raw, color="red"): + if not self.async_rollout_mode: + gen_batch_output = self.actor_rollout_wg.generate_sequences(gen_batch) + else: + gen_batch_output = self.async_rollout_manager.generate_sequences(gen_batch) + timing_raw.update(gen_batch_output.meta_info["timing"]) + gen_batch_output.meta_info.pop("timing", None) + + if 
self.config.algorithm.adv_estimator == AdvantageEstimator.REMAX: + with marked_timer("gen_max", timing_raw, color="purple"): + gen_baseline_batch = deepcopy(gen_batch) + gen_baseline_batch.meta_info["do_sample"] = False + if not self.async_rollout_mode: + gen_baseline_output = self.actor_rollout_wg.generate_sequences(gen_baseline_batch) + else: + gen_baseline_output = self.async_rollout_manager.generate_sequences(gen_baseline_batch) + batch = batch.union(gen_baseline_output) + reward_baseline_tensor = self.reward_fn(batch) + reward_baseline_tensor = reward_baseline_tensor.sum(dim=-1) + + batch.pop(batch_keys=list(gen_baseline_output.batch.keys())) + + batch.batch["reward_baselines"] = reward_baseline_tensor + + del gen_baseline_batch, gen_baseline_output + + batch = self._post_generate_batch(batch, gen_batch_output, metrics) + batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw) + self._log_rollout(batch, reward_extra_infos_dict, timing_raw) + last_val_metrics = self._validate_metrics(is_last_step, last_val_metrics, metrics, timing_raw) + self._check_save_checkpoint(is_last_step, timing_raw) + + self._stop_profiling(do_profile, timing_raw) + self._collect_metrics(batch, epoch, metrics, timing_raw) + self._post_batch_processing(batch) + + # TODO: make a canonical logger that supports various backend + logger.log(data=metrics, step=self.global_steps) + + progress_bar.update(1) + self.global_steps += 1 + + if is_last_step: + pprint(f"Final validation metrics: {last_val_metrics}") + progress_bar.close() + return + + def _prepare_generate_batch(self, batch_dict): + batch: DataProto = DataProto.from_single_dict(batch_dict) + # pop those keys for generation + batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"] + non_tensor_batch_keys_to_pop = ["raw_prompt_ids"] + if "multi_modal_data" in batch.non_tensor_batch: + non_tensor_batch_keys_to_pop.append("multi_modal_data") + if "raw_prompt" in batch.non_tensor_batch: + 
non_tensor_batch_keys_to_pop.append("raw_prompt") + if "tools_kwargs" in batch.non_tensor_batch: + non_tensor_batch_keys_to_pop.append("tools_kwargs") + if "interaction_kwargs" in batch.non_tensor_batch: + non_tensor_batch_keys_to_pop.append("interaction_kwargs") + if "index" in batch.non_tensor_batch: + non_tensor_batch_keys_to_pop.append("index") + if "agent_name" in batch.non_tensor_batch: + non_tensor_batch_keys_to_pop.append("agent_name") + gen_batch = batch.pop( + batch_keys=batch_keys_to_pop, + non_tensor_batch_keys=non_tensor_batch_keys_to_pop, + ) + # pass global_steps to trace + gen_batch.meta_info["global_steps"] = self.global_steps + gen_batch = gen_batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True) + return batch, gen_batch + + def _post_generate_batch(self, batch, gen_batch_output, metrics): + batch.non_tensor_batch["uid"] = np.array([str(uuid.uuid4()) for _ in range(len(batch.batch))], dtype=object) + # repeat to align with repeated responses in rollout + batch = batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True) + batch = batch.union(gen_batch_output) + if "response_mask" not in batch.batch.keys(): + batch.batch["response_mask"] = compute_response_mask(batch) + # Balance the number of valid tokens across DP ranks. + # NOTE: This usually changes the order of data in the `batch`, + # which won't affect the advantage calculation (since it's based on uid), + # but might affect the loss calculation (due to the change of mini-batching). + # TODO: Decouple the DP balancing and mini-batching. 
+ if self.config.trainer.balance_batch: + self._balance_batch(batch, metrics=metrics) + # compute global_valid tokens + batch.meta_info["global_token_num"] = torch.sum(batch.batch["attention_mask"], dim=-1).tolist() + return batch + + def _process_batch_common(self, batch, metrics, timing_raw): + with marked_timer("reward", timing_raw, color="yellow"): + # compute reward model score + if self.use_rm: + reward_tensor = self.rm_wg.compute_rm_score(batch) + batch = batch.union(reward_tensor) + + if self.config.reward_model.launch_reward_fn_async: + future_reward = compute_reward_async.remote(data=batch, reward_fn=self.reward_fn) + else: + reward_tensor, reward_extra_infos_dict = compute_reward(batch, self.reward_fn) + # recompute old_log_probs + with marked_timer("old_log_prob", timing_raw, color="blue"): + async_training = self.config.get("async_training", None) + if async_training and async_training.use_rollout_log_probs: + batch.batch["old_log_probs"] = batch.batch["rollout_log_probs"] + batch.meta_info["temperature"] = self.config.actor_rollout_ref.rollout.temperature + + else: + old_log_prob = self.actor_rollout_wg.compute_log_prob(batch) + entropys = old_log_prob.batch["entropys"] + response_masks = batch.batch["response_mask"] + loss_agg_mode = self.config.actor_rollout_ref.actor.loss_agg_mode + entropy_agg = agg_loss(loss_mat=entropys, loss_mask=response_masks, loss_agg_mode=loss_agg_mode) + old_log_prob_metrics = {"actor/entropy": entropy_agg.detach().item()} + metrics.update(old_log_prob_metrics) + old_log_prob.batch.pop("entropys") + batch = batch.union(old_log_prob) + + if "rollout_log_probs" in batch.batch.keys(): + # TODO: we may want to add diff of probs too. 
+ rollout_old_log_probs = batch.batch["rollout_log_probs"] + actor_old_log_probs = batch.batch["old_log_probs"] + attention_mask = batch.batch["attention_mask"] + responses = batch.batch["responses"] + response_length = responses.size(1) + response_mask = attention_mask[:, -response_length:] + + rollout_probs = torch.exp(rollout_old_log_probs) + actor_probs = torch.exp(actor_old_log_probs) + rollout_probs_diff = torch.abs(rollout_probs - actor_probs) + rollout_probs_diff = torch.masked_select(rollout_probs_diff, response_mask.bool()) + rollout_probs_diff_max = torch.max(rollout_probs_diff) + rollout_probs_diff_mean = torch.mean(rollout_probs_diff) + rollout_probs_diff_std = torch.std(rollout_probs_diff) + metrics.update( + { + "training/rollout_probs_diff_max": rollout_probs_diff_max.detach().item(), + "training/rollout_probs_diff_mean": rollout_probs_diff_mean.detach().item(), + "training/rollout_probs_diff_std": rollout_probs_diff_std.detach().item(), + } + ) + + if self.use_reference_policy: + # compute reference log_prob + with marked_timer("ref", timing_raw, color="olive"): + if not self.ref_in_actor: + ref_log_prob = self.ref_policy_wg.compute_ref_log_prob(batch) + else: + ref_log_prob = self.actor_rollout_wg.compute_ref_log_prob(batch) + batch = batch.union(ref_log_prob) + # compute values + if self.use_critic: + with marked_timer("values", timing_raw, color="cyan"): + values = self.critic_wg.compute_values(batch) + batch = batch.union(values) + with marked_timer("adv", timing_raw, color="brown"): + # we combine with rule-based rm + reward_extra_infos_dict: dict[str, list] + if self.config.reward_model.launch_reward_fn_async: + reward_tensor, reward_extra_infos_dict = ray.get(future_reward) + batch.batch["token_level_scores"] = reward_tensor + + if reward_extra_infos_dict: + batch.non_tensor_batch.update({k: np.array(v) for k, v in reward_extra_infos_dict.items()}) + + # compute rewards. 
apply_kl_penalty if available + if self.config.algorithm.use_kl_in_reward: + batch, kl_metrics = apply_kl_penalty( + batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.config.algorithm.kl_penalty + ) + metrics.update(kl_metrics) + else: + batch.batch["token_level_rewards"] = batch.batch["token_level_scores"] + + # compute advantages, executed on the driver process + + norm_adv_by_std_in_grpo = self.config.algorithm.get( + "norm_adv_by_std_in_grpo", True + ) # GRPO adv normalization factor + + batch = compute_advantage( + batch, + adv_estimator=self.config.algorithm.adv_estimator, + gamma=self.config.algorithm.gamma, + lam=self.config.algorithm.lam, + num_repeat=self.config.actor_rollout_ref.rollout.n, + norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo, + config=self.config.algorithm, + ) + # update critic + if self.use_critic: + with marked_timer("update_critic", timing_raw, color="pink"): + critic_output = self.critic_wg.update_critic(batch) + critic_output_metrics = reduce_metrics(critic_output.meta_info["metrics"]) + metrics.update(critic_output_metrics) + # implement critic warmup + if self.config.trainer.critic_warmup <= self.global_steps: + # update actor + with marked_timer("update_actor", timing_raw, color="red"): + batch.meta_info["multi_turn"] = self.config.actor_rollout_ref.rollout.multi_turn.enable + actor_output = self.actor_rollout_wg.update_actor(batch) + actor_output_metrics = reduce_metrics(actor_output.meta_info["metrics"]) + metrics.update(actor_output_metrics) + return batch, reward_extra_infos_dict + + def _log_rollout(self, batch, reward_extra_infos_dict, timing_raw): + """Log rollout generations if enabled""" + rollout_data_dir = self.config.trainer.get("rollout_data_dir", None) + if rollout_data_dir: + with marked_timer("dump_rollout_generations", timing_raw, color="green"): + inputs = self.tokenizer.batch_decode(batch.batch["prompts"], skip_special_tokens=True) + outputs = self.tokenizer.batch_decode(batch.batch["responses"], 
skip_special_tokens=True) + scores = batch.batch["token_level_scores"].sum(-1).cpu().tolist() + if "request_id" in batch.non_tensor_batch: + reward_extra_infos_dict.setdefault( + "request_id", + batch.non_tensor_batch["request_id"].tolist(), + ) + self._dump_generations( + inputs=inputs, + outputs=outputs, + scores=scores, + reward_extra_infos_dict=reward_extra_infos_dict, + dump_path=rollout_data_dir, + ) + + def _validate_metrics(self, is_last_step, last_val_metrics, metrics, timing_raw): + if ( + self.val_reward_fn is not None + and self.config.trainer.test_freq > 0 + and (is_last_step or self.global_steps % self.config.trainer.test_freq == 0) + ): + with marked_timer("testing", timing_raw, color="green"): + val_metrics: dict = self._validate() + if is_last_step: + last_val_metrics = val_metrics + metrics.update(val_metrics) + return last_val_metrics + + def _check_save_checkpoint(self, is_last_step, timing_raw): + # Check if the ESI (Elastic Server Instance)/training plan is close to expiration. + esi_close_to_expiration = should_save_ckpt_esi( + max_steps_duration=self.max_steps_duration, + redundant_time=self.config.trainer.esi_redundant_time, + ) + # Check if the conditions for saving a checkpoint are met. + # The conditions include a mandatory condition (1) and + # one of the following optional conditions (2/3/4): + # 1. The save frequency is set to a positive value. + # 2. It's the last training step. + # 3. The current step number is a multiple of the save frequency. + # 4. The ESI(Elastic Server Instance)/training plan is close to expiration. 
+ if self.config.trainer.save_freq > 0 and ( + is_last_step or self.global_steps % self.config.trainer.save_freq == 0 or esi_close_to_expiration + ): + if esi_close_to_expiration: + print("Force saving checkpoint: ESI instance expiration approaching.") + with marked_timer("save_checkpoint", timing_raw, color="green"): + self._save_checkpoint() + + def _collect_metrics(self, batch, epoch, metrics, timing_raw): + steps_duration = timing_raw["step"] + self.max_steps_duration = max(self.max_steps_duration, steps_duration) + # training metrics + metrics.update( + { + "training/global_step": self.global_steps, + "training/epoch": epoch, + } + ) + # collect metrics + metrics.update(compute_data_metrics(batch=batch, use_critic=self.use_critic)) + metrics.update(compute_timing_metrics(batch=batch, timing_raw=timing_raw)) + # TODO: implement actual tflpo and theoretical tflpo + n_gpus = self.resource_pool_manager.get_n_gpus() + metrics.update(compute_throughout_metrics(batch=batch, timing_raw=timing_raw, n_gpus=n_gpus)) + + def _post_batch_processing(self, batch: DataProto): + # this is experimental and may be changed/removed in the future in favor of a general-purpose one + if isinstance(self.train_dataloader.sampler, AbstractCurriculumSampler): + self.train_dataloader.sampler.update(batch=batch) + + # this is experimental and may be changed/removed in the future + # in favor of a general-purpose data buffer pool + if hasattr(self.train_dataset, "on_batch_end"): + # The dataset may be changed after each training batch + self.train_dataset.on_batch_end(batch=batch) From c20666039c4300056dce97bf0af1de4dca5142fc Mon Sep 17 00:00:00 2001 From: wangshulin02 <953550366@qq.com> Date: Mon, 15 Sep 2025 15:43:29 +0800 Subject: [PATCH 132/182] restore modified files in verl folder --- recipe/fully_async_policy/detach_utils.py | 2 +- recipe/fully_async_policy/ray_trainer.py | 2 +- verl/experimental/agent_loop/__init__.py | 5 +- verl/experimental/agent_loop/agent_loop.py | 268 ++------ 
.../partial_single_turn_agent_loop.py | 74 --- verl/trainer/main_ppo.py | 15 +- verl/trainer/ppo/ray_trainer.py | 618 ++++++++---------- .../rollout/vllm_rollout/vllm_async_server.py | 65 +- 8 files changed, 356 insertions(+), 693 deletions(-) delete mode 100644 verl/experimental/agent_loop/partial_single_turn_agent_loop.py diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py index 18e45d50a16..75d67ec1ab1 100644 --- a/recipe/fully_async_policy/detach_utils.py +++ b/recipe/fully_async_policy/detach_utils.py @@ -20,7 +20,7 @@ import torch from verl import DataProto -from verl.experimental.agent_loop.agent_loop import postprocess_agent_loop_outputs +from recipe.fully_async_policy.agent_loop.agent_loop import postprocess_agent_loop_outputs from verl.trainer.ppo.ray_trainer import compute_response_mask diff --git a/recipe/fully_async_policy/ray_trainer.py b/recipe/fully_async_policy/ray_trainer.py index 56a1e5bcab1..dea3aa2c26e 100644 --- a/recipe/fully_async_policy/ray_trainer.py +++ b/recipe/fully_async_policy/ray_trainer.py @@ -920,7 +920,7 @@ def _init_async_rollout_manager(self): # create async rollout manager and request scheduler self.async_rollout_mode = False if self.config.actor_rollout_ref.rollout.mode == "async": - from verl.experimental.agent_loop import AgentLoopManager + from recipe.fully_async_policy.agent_loop.agent_loop import AgentLoopManager self.async_rollout_mode = True self.async_rollout_manager = AgentLoopManager( diff --git a/verl/experimental/agent_loop/__init__.py b/verl/experimental/agent_loop/__init__.py index 67dcb16047e..c6f58f83c83 100644 --- a/verl/experimental/agent_loop/__init__.py +++ b/verl/experimental/agent_loop/__init__.py @@ -13,10 +13,9 @@ # limitations under the License. 
from .agent_loop import AgentLoopBase, AgentLoopManager -from .partial_single_turn_agent_loop import PartialSingleTurnAgentLoop from .single_turn_agent_loop import SingleTurnAgentLoop from .tool_agent_loop import ToolAgentLoop -_ = [SingleTurnAgentLoop, ToolAgentLoop, PartialSingleTurnAgentLoop] +_ = [SingleTurnAgentLoop, ToolAgentLoop] -__all__ = ["AgentLoopBase", "AgentLoopManager"] +__all__ = ["AgentLoopBase", "AgentLoopManager"] \ No newline at end of file diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py index ddcad093326..4639229a3b0 100644 --- a/verl/experimental/agent_loop/agent_loop.py +++ b/verl/experimental/agent_loop/agent_loop.py @@ -17,7 +17,7 @@ import os import random from abc import ABC, abstractmethod -from typing import Any, Optional +from typing import Any import hydra import numpy as np @@ -103,16 +103,6 @@ async def generate( ) return output - async def generate_for_partial(self, request_id, prompt_ids, sampling_params): - """Generate tokens from prompt ids. 
with partial rollout function""" - server = self._choose_server(request_id) - output = await server.generate_for_partial.remote( - request_id=request_id, - prompt_ids=prompt_ids, - sampling_params=sampling_params, - ) - return output - class AgentLoopMetrics(BaseModel): """Agent loop performance metrics.""" @@ -134,10 +124,6 @@ class AgentLoopOutput(BaseModel): """Number of chat turns, including user, assistant, tool.""" metrics: AgentLoopMetrics """Auxiliary performance metrics""" - is_cancel: bool = False - """Indicates whether the request was interrupted""" - log_probs: list[float] = None - """Response token log probs including LLM generated token, tool response token.""" # make hydra.utils.instantiate happy @@ -214,81 +200,6 @@ def decorator(subclass: type[AgentLoopBase]) -> type[AgentLoopBase]: return decorator -def postprocess_agent_loop_outputs(inputs: list[AgentLoopOutput], tokenizer, config) -> DataProto: - """Static method to postprocess a list of AgentLoopOutput into DataProto - - Args: - inputs: List of AgentLoopOutput - tokenizer: Tokenizer instance - config: Configuration object - - Returns: - DataProto: Processed batch data - """ - # NOTE: consistent with batch version of generate_sequences in vllm_rollout_spmd.py - # prompts: left pad - # responses: right pad - # input_ids: prompt + response - # attention_mask: [0,0,0,0,1,1,1,1, | 1,1,1,0,0,0,0,0] - # position_ids: [0,0,0,0,0,1,2,3, | 4,5,6,7,8,9,10,11] - - # prompts - tokenizer.padding_side = "left" - outputs = tokenizer.pad( - [{"input_ids": input.prompt_ids} for input in inputs], - padding="max_length", - max_length=config.actor_rollout_ref.rollout.prompt_length, - return_tensors="pt", - return_attention_mask=True, - ) - prompt_ids, prompt_attention_mask = outputs["input_ids"], outputs["attention_mask"] - - # responses - tokenizer.padding_side = "right" - outputs = tokenizer.pad( - [{"input_ids": input.response_ids} for input in inputs], - padding="max_length", - 
max_length=config.actor_rollout_ref.rollout.response_length, - return_tensors="pt", - return_attention_mask=True, - ) - response_ids, response_attention_mask = outputs["input_ids"], outputs["attention_mask"] - - # response_mask - outputs = tokenizer.pad( - [{"input_ids": input.response_mask} for input in inputs], - padding="max_length", - max_length=config.actor_rollout_ref.rollout.response_length, - return_tensors="pt", - return_attention_mask=False, - ) - response_mask = outputs["input_ids"] - assert response_ids.shape == response_mask.shape, ( - f"mismatch in response_ids and response_mask shape: {response_ids.shape} vs {response_mask.shape}" - ) - response_mask = response_mask * response_attention_mask - - input_ids = torch.cat([prompt_ids, response_ids], dim=1) - attention_mask = torch.cat([prompt_attention_mask, response_attention_mask], dim=1) - position_ids = (attention_mask.cumsum(dim=1) - 1) * attention_mask - - batch = TensorDict( - { - "prompts": prompt_ids, # [bsz, prompt_length] - "responses": response_ids, # [bsz, response_length] - "response_mask": response_mask, # [bsz, response_length] - "input_ids": input_ids, # [bsz, prompt_length + response_length] - "attention_mask": attention_mask, # [bsz, prompt_length + response_length] - "position_ids": position_ids, # [bsz, prompt_length + response_length] - }, - batch_size=len(input_ids), - ) - - num_turns = np.array([input.num_turns for input in inputs], dtype=np.int32) - metrics = [input.metrics.model_dump() for input in inputs] - return DataProto(batch=batch, non_tensor_batch={"__num_turns__": num_turns}, meta_info={"metrics": metrics}) - - @ray.remote class AgentLoopWorker: """Agent loop worker takes a batch of messages and run each message in an agent loop.""" @@ -378,76 +289,15 @@ async def generate_sequences(self, batch: DataProto) -> DataProto: ) outputs = await asyncio.gather(*tasks) - output = postprocess_agent_loop_outputs(outputs, self.tokenizer, self.config) + output = 
self._postprocess(outputs) return output - async def generate_sequences_no_post( - self, batch: DataProto, partial_output_list: Optional[list[AgentLoopOutput]] - ) -> list[AgentLoopOutput]: - """Generate sequences from agent loop. - - Args: - batch (DataProto): Input batch. - partial_output_list: Optional[List[AgentLoopOutput]]: already rollout result. - - Returns: - list[AgentLoopOutput]: List of agent loop outputs, one per sample in the batch. - Each AgentLoopOutput contains: - - prompt_ids: prompt token ids - - response_ids: response token ids including LLM generated and tool response tokens - - response_mask: 1 for LLM generated tokens, 0 for tool response tokens - - num_turns: number of chat turns - - metrics: performance metrics - """ - config = self.config.actor_rollout_ref.rollout - sampling_params = dict( - temperature=config.temperature, - top_p=config.top_p, - repetition_penalty=1.0, - ) - - # override sampling params for validation - if batch.meta_info.get("validate", False): - sampling_params["top_p"] = config.val_kwargs.top_p - sampling_params["temperature"] = config.val_kwargs.temperature - - # by default, we assume it's a single turn agent - if "agent_name" not in batch.non_tensor_batch: - batch.non_tensor_batch["agent_name"] = np.array(["single_turn_agent"] * len(batch), dtype=object) - - tasks = [] - agent_names = batch.non_tensor_batch["agent_name"] - raw_prompts = batch.non_tensor_batch["raw_prompt"] - if "index" in batch.non_tensor_batch: - index = batch.non_tensor_batch["index"] - else: - index = np.arange(len(raw_prompts)) - - trajectory_info = await get_trajectory_info( - batch.meta_info.get("global_steps", -1), index, batch.meta_info.get("validate", False) - ) - if not partial_output_list: - partial_output_list = [None] * len(batch) - - for agent_name, messages, trajectory, partial_output in zip( - agent_names, raw_prompts, trajectory_info, partial_output_list, strict=True - ): - tasks.append( - asyncio.create_task( - 
self._run_agent_loop(agent_name, messages.tolist(), sampling_params, trajectory, partial_output) - ) - ) - outputs = await asyncio.gather(*tasks) - - return outputs - async def _run_agent_loop( self, agent_name: str, messages: list[dict[str, Any]], sampling_params: dict[str, Any], trajectory: dict[str, Any], - partial_output: Optional[AgentLoopOutput] = None, ) -> AgentLoopOutput: with rollout_trace_attr( step=trajectory["step"], @@ -459,6 +309,7 @@ async def _run_agent_loop( assert agent_name in _agent_loop_registry, ( f"Agent loop {agent_name} not registered, registered agent loops: {_agent_loop_registry.keys()}" ) + agent_loop_config = _agent_loop_registry[agent_name] agent_loop = hydra.utils.instantiate( config=agent_loop_config, @@ -466,12 +317,73 @@ async def _run_agent_loop( server_manager=self.server_manager, tokenizer=self.tokenizer, ) - if agent_name == "partial_single_turn_agent": - output = await agent_loop.run(messages, sampling_params, partial_output) - else: - output = await agent_loop.run(messages, sampling_params) + output = await agent_loop.run(messages, sampling_params) return output + def _postprocess(self, inputs: list[AgentLoopOutput]) -> DataProto: + # NOTE: consistent with batch version of generate_sequences in vllm_rollout_spmd.py + # prompts: left pad + # responses: right pad + # input_ids: prompt + response + # attention_mask: [0,0,0,0,1,1,1,1, | 1,1,1,0,0,0,0,0] + # position_ids: [0,0,0,0,0,1,2,3, | 4,5,6,7,8,9,10,11] + + # prompts + self.tokenizer.padding_side = "left" + outputs = self.tokenizer.pad( + [{"input_ids": input.prompt_ids} for input in inputs], + padding="max_length", + max_length=self.config.actor_rollout_ref.rollout.prompt_length, + return_tensors="pt", + return_attention_mask=True, + ) + prompt_ids, prompt_attention_mask = outputs["input_ids"], outputs["attention_mask"] + + # responses + self.tokenizer.padding_side = "right" + outputs = self.tokenizer.pad( + [{"input_ids": input.response_ids} for input in inputs], + 
padding="max_length", + max_length=self.config.actor_rollout_ref.rollout.response_length, + return_tensors="pt", + return_attention_mask=True, + ) + response_ids, response_attention_mask = outputs["input_ids"], outputs["attention_mask"] + + # response_mask + outputs = self.tokenizer.pad( + [{"input_ids": input.response_mask} for input in inputs], + padding="max_length", + max_length=self.config.actor_rollout_ref.rollout.response_length, + return_tensors="pt", + return_attention_mask=False, + ) + response_mask = outputs["input_ids"] + assert response_ids.shape == response_mask.shape, ( + f"mismatch in response_ids and response_mask shape: {response_ids.shape} vs {response_mask.shape}" + ) + response_mask = response_mask * response_attention_mask + + input_ids = torch.cat([prompt_ids, response_ids], dim=1) + attention_mask = torch.cat([prompt_attention_mask, response_attention_mask], dim=1) + position_ids = (attention_mask.cumsum(dim=1) - 1) * attention_mask + + batch = TensorDict( + { + "prompts": prompt_ids, # [bsz, prompt_length] + "responses": response_ids, # [bsz, response_length] + "response_mask": response_mask, # [bsz, response_length] + "input_ids": input_ids, # [bsz, prompt_length + response_length] + "attention_mask": attention_mask, # [bsz, prompt_length + response_length] + "position_ids": position_ids, # [bsz, prompt_length + response_length] + }, + batch_size=len(input_ids), + ) + + num_turns = np.array([input.num_turns for input in inputs], dtype=np.int32) + metrics = [input.metrics.model_dump() for input in inputs] + return DataProto(batch=batch, non_tensor_batch={"__num_turns__": num_turns}, meta_info={"metrics": metrics}) + async def get_trajectory_info(step, index, validate): """Get trajectory info. @@ -503,7 +415,7 @@ def __init__(self, config: DictConfig, worker_group: RayWorkerGroup): Args: config (DictConfig): trainer config. - worker_group (RayWorkerGroup): AsyncActorRolloutRefWorker worker group. 
+ worker_group (RayWorkerGroup): ActorRolloutRef worker group. """ self.config = config self.worker_group = worker_group @@ -600,36 +512,6 @@ def generate_sequences(self, prompts: DataProto) -> DataProto: output.meta_info = {"timing": timing} return output - async def generate_single_sample_async( - self, - sample: DataProto, - partial_output_list: Optional[list[AgentLoopOutput]], - ) -> list[AgentLoopOutput]: - """ - 异步处理单个样本, 需要复制n次 - - Args: - sample: 单个样本数据 - partial_output_list: Optional[List[AgentLoopOutput]]: already rollout result. - - Returns: - tuple[AgentLoopOutput, float]: 处理结果和处理时间 - """ - # 使用负载均衡选择 worker - worker = self._select_best_worker() - # 异步处理单个样本 - 使用无后处理版本获取原始AgentLoopOutput - output_future = worker.generate_sequences_no_post.remote(sample, partial_output_list) - return await asyncio.wrap_future(output_future.future()) - - def _select_best_worker(self): - """选择最佳的 worker(简单的轮询负载均衡)""" - if not hasattr(self, "_worker_index"): - self._worker_index = 0 - - worker = self.agent_loop_workers[self._worker_index] - self._worker_index = (self._worker_index + 1) % len(self.agent_loop_workers) - return worker - def _performance_metrics(self, metrics: list[list[dict[str, str]]], output: DataProto) -> dict[str, float]: timing = {} t_generate_sequences = np.array([metric["generate_sequences"] for chunk in metrics for metric in chunk]) @@ -658,14 +540,4 @@ def wake_up(self): def sleep(self): """Sleep all rollout server instances.""" - ray.get([server.sleep.remote() for server in self.async_llm_servers]) - - async def cancel_async(self): - """Cancel all rollout tasks asynchronously.""" - futures = [server.cancel.remote() for server in self.async_llm_servers] - await asyncio.gather(*[asyncio.wrap_future(future.future()) for future in futures], return_exceptions=True) - - async def resume_async(self): - """Cancel all rollout tasks asynchronously.""" - futures = [server.resume.remote() for server in self.async_llm_servers] - await 
asyncio.gather(*[asyncio.wrap_future(future.future()) for future in futures], return_exceptions=True) + ray.get([server.sleep.remote() for server in self.async_llm_servers]) \ No newline at end of file diff --git a/verl/experimental/agent_loop/partial_single_turn_agent_loop.py b/verl/experimental/agent_loop/partial_single_turn_agent_loop.py deleted file mode 100644 index df4a4f3350a..00000000000 --- a/verl/experimental/agent_loop/partial_single_turn_agent_loop.py +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright 2024 Bytedance Ltd. and/or its affiliates -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import logging -import os -from typing import Any, Optional -from uuid import uuid4 - -from verl.experimental.agent_loop.agent_loop import AgentLoopBase, AgentLoopOutput, register -from verl.utils.profiler import simple_timer - -logger = logging.getLogger(__file__) -logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) - - -@register("partial_single_turn_agent") -class PartialSingleTurnAgentLoop(AgentLoopBase): - """Naive agent loop that only do single turn chat completion.""" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.prompt_length = self.config.actor_rollout_ref.rollout.prompt_length - self.response_length = self.config.actor_rollout_ref.rollout.response_length - - async def run( - self, messages: list[dict[str, Any]], sampling_params: dict[str, Any], output: Optional[AgentLoopOutput] - ) -> AgentLoopOutput: - if not output: - prompt_ids = await self.loop.run_in_executor( - None, lambda: self.tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True) - ) - else: - if output.is_cancel: - # 恢复暂停的样本,结果直接添加到 prompt_ids 后面 - prompt_ids = output.prompt_ids + output.response_ids - else: - # 同一批样本,部分cancel,部分没有cancel, 没有cancel的样本直接返回 - return output - - metrics = {} - request_id = uuid4().hex - with simple_timer("generate_sequences", metrics): - response_ids, log_probs, is_cancel = await self.server_manager.generate_for_partial( - request_id=request_id, prompt_ids=prompt_ids, sampling_params=sampling_params - ) - - if not output: - response_mask = [1] * len(response_ids) - # 暂停待恢复样本, 把输出结果加到 response_ids 后,并重置 response_mask - else: - prompt_ids = output.prompt_ids - log_probs = output.log_probs + log_probs - response_ids = output.response_ids + response_ids - response_mask = [1] * len(response_ids) - - return AgentLoopOutput( - prompt_ids=prompt_ids, - response_ids=response_ids[: self.response_length], - response_mask=response_mask[: self.response_length], - num_turns=2, - metrics=metrics, - 
is_cancel=is_cancel, - log_probs=log_probs, - ) diff --git a/verl/trainer/main_ppo.py b/verl/trainer/main_ppo.py index 4b240c6ffbf..8d2b811c733 100644 --- a/verl/trainer/main_ppo.py +++ b/verl/trainer/main_ppo.py @@ -37,15 +37,11 @@ def main(config): Args: config_dict: Hydra configuration dictionary containing training parameters. """ - from time import time - - start_time = time() run_ppo(config) - print(f"total time: {time() - start_time:.2f} seconds") # Define a function to run the PPO-like training process -def run_ppo(config, task_runner_class=None) -> None: +def run_ppo(config) -> None: """Initialize Ray cluster and run distributed PPO training process. Args: @@ -63,9 +59,6 @@ def run_ppo(config, task_runner_class=None) -> None: runtime_env=get_ppo_ray_runtime_env(), num_cpus=config.ray_init.num_cpus, ) - # for recipe to change TaskRunner - if task_runner_class is None: - task_runner_class = TaskRunner # Create a remote instance of the TaskRunner class, and # Execute the `run` method of the TaskRunner instance remotely and wait for it to complete @@ -75,9 +68,9 @@ def run_ppo(config, task_runner_class=None) -> None: and len(config.trainer.get("profile_steps", [])) > 0 ): nsight_options = OmegaConf.to_container(config.trainer.controller_nsight_options) - runner = task_runner_class.options(runtime_env={"nsight": nsight_options}).remote() + runner = TaskRunner.options(runtime_env={"nsight": nsight_options}).remote() else: - runner = task_runner_class.remote() + runner = TaskRunner.remote() ray.get(runner.run.remote(config)) # [Optional] get the path of the timeline trace file from the configuration, default to None @@ -341,4 +334,4 @@ def create_rl_sampler(data_config, dataset): if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py index 56a1e5bcab1..05281ebe3f9 100644 --- a/verl/trainer/ppo/ray_trainer.py +++ b/verl/trainer/ppo/ray_trainer.py @@ -79,40 +79,6 @@ 
class Role(Enum): RewardModel = 5 ActorRolloutRef = 6 - def __str__(self): - """返回与代码中一致的字符串表示""" - return self._get_role_string() - - def _get_role_string(self): - """获取角色对应的字符串名称""" - role_mapping = { - Role.Actor: "actor", - Role.Rollout: "rollout", - Role.ActorRollout: "actor_rollout", - Role.Critic: "critic", - Role.RefPolicy: "ref", - Role.RewardModel: "rm", - Role.ActorRolloutRef: "actor_rollout_ref", - } - return role_mapping.get(self, self.name.lower()) - - @classmethod - def from_string(cls, name: str): - """从字符串创建Role实例""" - string_mapping = { - "actor": cls.Actor, - "rollout": cls.Rollout, - "actor_rollout": cls.ActorRollout, - "critic": cls.Critic, - "ref": cls.RefPolicy, - "rm": cls.RewardModel, - "actor_rollout_ref": cls.ActorRolloutRef, - } - role = string_mapping.get(name.lower()) - if role is None: - raise ValueError(f"No Role found for string: {name}") - return role - @dataclass class ResourcePoolManager: @@ -438,15 +404,15 @@ def _validate_config(self): megatron_dp = n_gpus // ( model_parallel_size * config.actor_rollout_ref.actor.megatron.context_parallel_size ) - self.minimal_bsz = megatron_dp * config.actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu + minimal_bsz = megatron_dp * config.actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu else: - self.minimal_bsz = n_gpus + minimal_bsz = n_gpus # 1. Check total batch size for data correctness real_train_batch_size = config.data.train_batch_size * config.actor_rollout_ref.rollout.n - assert real_train_batch_size % self.minimal_bsz == 0, ( + assert real_train_batch_size % minimal_bsz == 0, ( f"real_train_batch_size ({real_train_batch_size}) must be divisible by minimal possible batch size " - f"({self.minimal_bsz})" + f"({minimal_bsz})" ) # A helper function to check "micro_batch_size" vs "micro_batch_size_per_gpu" @@ -810,65 +776,48 @@ def init_workers(self): 1. Ray resource pools from configuration 2. Worker groups for each role (actor, critic, etc.) 
""" - self._init_resource_pools() - self._create_worker_classes() - self._init_worker_groups() - self._init_models() - self._init_async_rollout_manager() - - def _init_resource_pools(self): self.resource_pool_manager.create_resource_pool() - self.resource_pool_to_cls = {pool: {} for pool in self.resource_pool_manager.resource_pool_dict.values()} - def _create_worker_classes(self): - self._create_actor_rollout_classes() - self._create_critic_class() - self._create_reference_policy_class() - self._create_reward_model_class() + self.resource_pool_to_cls = {pool: {} for pool in self.resource_pool_manager.resource_pool_dict.values()} - def _create_actor_rollout_classes(self): # create actor and rollout if self.hybrid_engine: resource_pool = self.resource_pool_manager.get_resource_pool(Role.ActorRollout) actor_rollout_cls = RayClassWithInitArgs( cls=self.role_worker_mapping[Role.ActorRollout], config=self.config.actor_rollout_ref, - role=str(Role.ActorRollout), + role="actor_rollout", profile_option=self.config.trainer.npu_profile.options, ) - self.resource_pool_to_cls[resource_pool][str(Role.ActorRollout)] = actor_rollout_cls + self.resource_pool_to_cls[resource_pool]["actor_rollout"] = actor_rollout_cls else: raise NotImplementedError - def _create_critic_class(self): # create critic if self.use_critic: resource_pool = self.resource_pool_manager.get_resource_pool(Role.Critic) critic_cfg = omega_conf_to_dataclass(self.config.critic) critic_cls = RayClassWithInitArgs(cls=self.role_worker_mapping[Role.Critic], config=critic_cfg) - self.resource_pool_to_cls[resource_pool][str(Role.Critic)] = critic_cls + self.resource_pool_to_cls[resource_pool]["critic"] = critic_cls - def _create_reference_policy_class(self): # create reference policy if needed if self.use_reference_policy: resource_pool = self.resource_pool_manager.get_resource_pool(Role.RefPolicy) ref_policy_cls = RayClassWithInitArgs( self.role_worker_mapping[Role.RefPolicy], config=self.config.actor_rollout_ref, - 
role=str(Role.RefPolicy), + role="ref", profile_option=self.config.trainer.npu_profile.options, ) - self.resource_pool_to_cls[resource_pool][str(Role.RefPolicy)] = ref_policy_cls + self.resource_pool_to_cls[resource_pool]["ref"] = ref_policy_cls - def _create_reward_model_class(self): # create a reward model if reward_fn is None if self.use_rm: # we create a RM here resource_pool = self.resource_pool_manager.get_resource_pool(Role.RewardModel) rm_cls = RayClassWithInitArgs(self.role_worker_mapping[Role.RewardModel], config=self.config.reward_model) - self.resource_pool_to_cls[resource_pool][str(Role.RewardModel)] = rm_cls + self.resource_pool_to_cls[resource_pool]["rm"] = rm_cls - def _init_worker_groups(self): # initialize WorkerGroup # NOTE: if you want to use a different resource pool for each role, which can support different parallel size, # you should not use `create_colocated_worker_cls`. @@ -897,26 +846,23 @@ def _init_worker_groups(self): ) spawn_wg = wg_dict.spawn(prefix_set=class_dict.keys()) all_wg.update(spawn_wg) - self.all_wg = all_wg - def _init_models(self): if self.use_critic: - self.critic_wg = self.all_wg[str(Role.Critic)] + self.critic_wg = all_wg["critic"] self.critic_wg.init_model() if self.use_reference_policy and not self.ref_in_actor: - self.ref_policy_wg = self.all_wg[str(Role.RefPolicy)] + self.ref_policy_wg = all_wg["ref"] self.ref_policy_wg.init_model() if self.use_rm: - self.rm_wg = self.all_wg[str(Role.RewardModel)] + self.rm_wg = all_wg["rm"] self.rm_wg.init_model() # we should create rollout at the end so that vllm can have a better estimation of kv cache memory - self.actor_rollout_wg = self.all_wg[str(Role.ActorRollout)] + self.actor_rollout_wg = all_wg["actor_rollout"] self.actor_rollout_wg.init_model() - def _init_async_rollout_manager(self): # create async rollout manager and request scheduler self.async_rollout_mode = False if self.config.actor_rollout_ref.rollout.mode == "async": @@ -1043,29 +989,27 @@ def 
_load_checkpoint(self): else: print(f"Warning: No dataloader state found at {dataloader_local_path}, will start from scratch") - def _start_profiling(self, do_profile: bool, timing_raw) -> None: + def _start_profiling(self, do_profile: bool) -> None: """Start profiling for all worker groups if profiling is enabled.""" - with marked_timer("start_profile", timing_raw): - if do_profile: - self.actor_rollout_wg.start_profile(role="e2e", profile_step=self.global_steps) - if self.use_reference_policy: - self.ref_policy_wg.start_profile() - if self.use_critic: - self.critic_wg.start_profile() - if self.use_rm: - self.rm_wg.start_profile() - - def _stop_profiling(self, do_profile: bool, timing_raw) -> None: + if do_profile: + self.actor_rollout_wg.start_profile(role="e2e", profile_step=self.global_steps) + if self.use_reference_policy: + self.ref_policy_wg.start_profile() + if self.use_critic: + self.critic_wg.start_profile() + if self.use_rm: + self.rm_wg.start_profile() + + def _stop_profiling(self, do_profile: bool) -> None: """Stop profiling for all worker groups if profiling is enabled.""" - with marked_timer("stop_profile", timing_raw): - if do_profile: - self.actor_rollout_wg.stop_profile() - if self.use_reference_policy: - self.ref_policy_wg.stop_profile() - if self.use_critic: - self.critic_wg.stop_profile() - if self.use_rm: - self.rm_wg.stop_profile() + if do_profile: + self.actor_rollout_wg.stop_profile() + if self.use_reference_policy: + self.ref_policy_wg.stop_profile() + if self.use_critic: + self.critic_wg.stop_profile() + if self.use_rm: + self.rm_wg.stop_profile() def _balance_batch(self, batch: DataProto, metrics, logging_prefix="global_seqlen"): """Reorder the data on single controller such that each dp rank gets similar total tokens""" @@ -1135,9 +1079,35 @@ def fit(self): if self.config.trainer.profile_steps is not None else False ) - self._start_profiling(do_profile, timing_raw) + with marked_timer("start_profile", timing_raw): + 
self._start_profiling(do_profile) + + batch: DataProto = DataProto.from_single_dict(batch_dict) + + # pop those keys for generation + batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"] + non_tensor_batch_keys_to_pop = ["raw_prompt_ids"] + if "multi_modal_data" in batch.non_tensor_batch: + non_tensor_batch_keys_to_pop.append("multi_modal_data") + if "raw_prompt" in batch.non_tensor_batch: + non_tensor_batch_keys_to_pop.append("raw_prompt") + if "tools_kwargs" in batch.non_tensor_batch: + non_tensor_batch_keys_to_pop.append("tools_kwargs") + if "interaction_kwargs" in batch.non_tensor_batch: + non_tensor_batch_keys_to_pop.append("interaction_kwargs") + if "index" in batch.non_tensor_batch: + non_tensor_batch_keys_to_pop.append("index") + if "agent_name" in batch.non_tensor_batch: + non_tensor_batch_keys_to_pop.append("agent_name") + + gen_batch = batch.pop( + batch_keys=batch_keys_to_pop, + non_tensor_batch_keys=non_tensor_batch_keys_to_pop, + ) - batch, gen_batch = self._prepare_generate_batch(batch_dict) + # pass global_steps to trace + gen_batch.meta_info["global_steps"] = self.global_steps + gen_batch = gen_batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True) is_last_step = self.global_steps >= self.total_training_steps @@ -1169,15 +1139,216 @@ def fit(self): del gen_baseline_batch, gen_baseline_output - batch = self._post_generate_batch(batch, gen_batch_output, metrics) - batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw) - self._log_rollout(batch, reward_extra_infos_dict, timing_raw) - last_val_metrics = self._validate_metrics(is_last_step, last_val_metrics, metrics, timing_raw) - self._check_save_checkpoint(is_last_step, timing_raw) + batch.non_tensor_batch["uid"] = np.array( + [str(uuid.uuid4()) for _ in range(len(batch.batch))], dtype=object + ) + # repeat to align with repeated responses in rollout + batch = batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, 
interleave=True) + batch = batch.union(gen_batch_output) + + if "response_mask" not in batch.batch.keys(): + batch.batch["response_mask"] = compute_response_mask(batch) + # Balance the number of valid tokens across DP ranks. + # NOTE: This usually changes the order of data in the `batch`, + # which won't affect the advantage calculation (since it's based on uid), + # but might affect the loss calculation (due to the change of mini-batching). + # TODO: Decouple the DP balancing and mini-batching. + if self.config.trainer.balance_batch: + self._balance_batch(batch, metrics=metrics) + + # compute global_valid tokens + batch.meta_info["global_token_num"] = torch.sum(batch.batch["attention_mask"], dim=-1).tolist() + + with marked_timer("reward", timing_raw, color="yellow"): + # compute reward model score + if self.use_rm: + reward_tensor = self.rm_wg.compute_rm_score(batch) + batch = batch.union(reward_tensor) + + if self.config.reward_model.launch_reward_fn_async: + future_reward = compute_reward_async.remote(data=batch, reward_fn=self.reward_fn) + else: + reward_tensor, reward_extra_infos_dict = compute_reward(batch, self.reward_fn) + + # recompute old_log_probs + with marked_timer("old_log_prob", timing_raw, color="blue"): + old_log_prob = self.actor_rollout_wg.compute_log_prob(batch) + entropys = old_log_prob.batch["entropys"] + response_masks = batch.batch["response_mask"] + loss_agg_mode = self.config.actor_rollout_ref.actor.loss_agg_mode + entropy_agg = agg_loss(loss_mat=entropys, loss_mask=response_masks, loss_agg_mode=loss_agg_mode) + old_log_prob_metrics = {"actor/entropy": entropy_agg.detach().item()} + metrics.update(old_log_prob_metrics) + old_log_prob.batch.pop("entropys") + batch = batch.union(old_log_prob) + + if "rollout_log_probs" in batch.batch.keys(): + # TODO: we may want to add diff of probs too. 
+ rollout_old_log_probs = batch.batch["rollout_log_probs"] + actor_old_log_probs = batch.batch["old_log_probs"] + attention_mask = batch.batch["attention_mask"] + responses = batch.batch["responses"] + response_length = responses.size(1) + response_mask = attention_mask[:, -response_length:] + + rollout_probs = torch.exp(rollout_old_log_probs) + actor_probs = torch.exp(actor_old_log_probs) + rollout_probs_diff = torch.abs(rollout_probs - actor_probs) + rollout_probs_diff = torch.masked_select(rollout_probs_diff, response_mask.bool()) + rollout_probs_diff_max = torch.max(rollout_probs_diff) + rollout_probs_diff_mean = torch.mean(rollout_probs_diff) + rollout_probs_diff_std = torch.std(rollout_probs_diff) + metrics.update( + { + "training/rollout_probs_diff_max": rollout_probs_diff_max.detach().item(), + "training/rollout_probs_diff_mean": rollout_probs_diff_mean.detach().item(), + "training/rollout_probs_diff_std": rollout_probs_diff_std.detach().item(), + } + ) + + if self.use_reference_policy: + # compute reference log_prob + with marked_timer("ref", timing_raw, color="olive"): + if not self.ref_in_actor: + ref_log_prob = self.ref_policy_wg.compute_ref_log_prob(batch) + else: + ref_log_prob = self.actor_rollout_wg.compute_ref_log_prob(batch) + batch = batch.union(ref_log_prob) + + # compute values + if self.use_critic: + with marked_timer("values", timing_raw, color="cyan"): + values = self.critic_wg.compute_values(batch) + batch = batch.union(values) + + with marked_timer("adv", timing_raw, color="brown"): + # we combine with rule-based rm + reward_extra_infos_dict: dict[str, list] + if self.config.reward_model.launch_reward_fn_async: + reward_tensor, reward_extra_infos_dict = ray.get(future_reward) + batch.batch["token_level_scores"] = reward_tensor + + if reward_extra_infos_dict: + batch.non_tensor_batch.update({k: np.array(v) for k, v in reward_extra_infos_dict.items()}) + + # compute rewards. 
apply_kl_penalty if available + if self.config.algorithm.use_kl_in_reward: + batch, kl_metrics = apply_kl_penalty( + batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.config.algorithm.kl_penalty + ) + metrics.update(kl_metrics) + else: + batch.batch["token_level_rewards"] = batch.batch["token_level_scores"] + + # compute advantages, executed on the driver process + + norm_adv_by_std_in_grpo = self.config.algorithm.get( + "norm_adv_by_std_in_grpo", True + ) # GRPO adv normalization factor + + batch = compute_advantage( + batch, + adv_estimator=self.config.algorithm.adv_estimator, + gamma=self.config.algorithm.gamma, + lam=self.config.algorithm.lam, + num_repeat=self.config.actor_rollout_ref.rollout.n, + norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo, + config=self.config.algorithm, + ) + + # update critic + if self.use_critic: + with marked_timer("update_critic", timing_raw, color="pink"): + critic_output = self.critic_wg.update_critic(batch) + critic_output_metrics = reduce_metrics(critic_output.meta_info["metrics"]) + metrics.update(critic_output_metrics) + + # implement critic warmup + if self.config.trainer.critic_warmup <= self.global_steps: + # update actor + with marked_timer("update_actor", timing_raw, color="red"): + batch.meta_info["multi_turn"] = self.config.actor_rollout_ref.rollout.multi_turn.enable + actor_output = self.actor_rollout_wg.update_actor(batch) + actor_output_metrics = reduce_metrics(actor_output.meta_info["metrics"]) + metrics.update(actor_output_metrics) + + # Log rollout generations if enabled + rollout_data_dir = self.config.trainer.get("rollout_data_dir", None) + if rollout_data_dir: + with marked_timer("dump_rollout_generations", timing_raw, color="green"): + inputs = self.tokenizer.batch_decode(batch.batch["prompts"], skip_special_tokens=True) + outputs = self.tokenizer.batch_decode(batch.batch["responses"], skip_special_tokens=True) + scores = batch.batch["token_level_scores"].sum(-1).cpu().tolist() + if "request_id" in 
batch.non_tensor_batch: + reward_extra_infos_dict.setdefault( + "request_id", + batch.non_tensor_batch["request_id"].tolist(), + ) + self._dump_generations( + inputs=inputs, + outputs=outputs, + scores=scores, + reward_extra_infos_dict=reward_extra_infos_dict, + dump_path=rollout_data_dir, + ) + + # validate + if ( + self.val_reward_fn is not None + and self.config.trainer.test_freq > 0 + and (is_last_step or self.global_steps % self.config.trainer.test_freq == 0) + ): + with marked_timer("testing", timing_raw, color="green"): + val_metrics: dict = self._validate() + if is_last_step: + last_val_metrics = val_metrics + metrics.update(val_metrics) + + # Check if the ESI (Elastic Server Instance)/training plan is close to expiration. + esi_close_to_expiration = should_save_ckpt_esi( + max_steps_duration=self.max_steps_duration, + redundant_time=self.config.trainer.esi_redundant_time, + ) + # Check if the conditions for saving a checkpoint are met. + # The conditions include a mandatory condition (1) and + # one of the following optional conditions (2/3/4): + # 1. The save frequency is set to a positive value. + # 2. It's the last training step. + # 3. The current step number is a multiple of the save frequency. + # 4. The ESI(Elastic Server Instance)/training plan is close to expiration. 
+ if self.config.trainer.save_freq > 0 and ( + is_last_step + or self.global_steps % self.config.trainer.save_freq == 0 + or esi_close_to_expiration + ): + if esi_close_to_expiration: + print("Force saving checkpoint: ESI instance expiration approaching.") + with marked_timer("save_checkpoint", timing_raw, color="green"): + self._save_checkpoint() + + with marked_timer("stop_profile", timing_raw): + self._stop_profiling(do_profile) + + steps_duration = timing_raw["step"] + self.max_steps_duration = max(self.max_steps_duration, steps_duration) + + # training metrics + metrics.update( + { + "training/global_step": self.global_steps, + "training/epoch": epoch, + } + ) + # collect metrics + metrics.update(compute_data_metrics(batch=batch, use_critic=self.use_critic)) + metrics.update(compute_timing_metrics(batch=batch, timing_raw=timing_raw)) + # TODO: implement actual tflpo and theoretical tflpo + n_gpus = self.resource_pool_manager.get_n_gpus() + metrics.update(compute_throughout_metrics(batch=batch, timing_raw=timing_raw, n_gpus=n_gpus)) - self._stop_profiling(do_profile, timing_raw) - self._collect_metrics(batch, epoch, metrics, timing_raw) - self._post_batch_processing(batch) + # this is experimental and may be changed/removed in the future in favor of a general-purpose one + if isinstance(self.train_dataloader.sampler, AbstractCurriculumSampler): + self.train_dataloader.sampler.update(batch=batch) # TODO: make a canonical logger that supports various backend logger.log(data=metrics, step=self.global_steps) @@ -1190,245 +1361,8 @@ def fit(self): progress_bar.close() return - def _prepare_generate_batch(self, batch_dict): - batch: DataProto = DataProto.from_single_dict(batch_dict) - # pop those keys for generation - batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"] - non_tensor_batch_keys_to_pop = ["raw_prompt_ids"] - if "multi_modal_data" in batch.non_tensor_batch: - non_tensor_batch_keys_to_pop.append("multi_modal_data") - if "raw_prompt" in 
batch.non_tensor_batch: - non_tensor_batch_keys_to_pop.append("raw_prompt") - if "tools_kwargs" in batch.non_tensor_batch: - non_tensor_batch_keys_to_pop.append("tools_kwargs") - if "interaction_kwargs" in batch.non_tensor_batch: - non_tensor_batch_keys_to_pop.append("interaction_kwargs") - if "index" in batch.non_tensor_batch: - non_tensor_batch_keys_to_pop.append("index") - if "agent_name" in batch.non_tensor_batch: - non_tensor_batch_keys_to_pop.append("agent_name") - gen_batch = batch.pop( - batch_keys=batch_keys_to_pop, - non_tensor_batch_keys=non_tensor_batch_keys_to_pop, - ) - # pass global_steps to trace - gen_batch.meta_info["global_steps"] = self.global_steps - gen_batch = gen_batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True) - return batch, gen_batch - - def _post_generate_batch(self, batch, gen_batch_output, metrics): - batch.non_tensor_batch["uid"] = np.array([str(uuid.uuid4()) for _ in range(len(batch.batch))], dtype=object) - # repeat to align with repeated responses in rollout - batch = batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True) - batch = batch.union(gen_batch_output) - if "response_mask" not in batch.batch.keys(): - batch.batch["response_mask"] = compute_response_mask(batch) - # Balance the number of valid tokens across DP ranks. - # NOTE: This usually changes the order of data in the `batch`, - # which won't affect the advantage calculation (since it's based on uid), - # but might affect the loss calculation (due to the change of mini-batching). - # TODO: Decouple the DP balancing and mini-batching. 
- if self.config.trainer.balance_batch: - self._balance_batch(batch, metrics=metrics) - # compute global_valid tokens - batch.meta_info["global_token_num"] = torch.sum(batch.batch["attention_mask"], dim=-1).tolist() - return batch - - def _process_batch_common(self, batch, metrics, timing_raw): - with marked_timer("reward", timing_raw, color="yellow"): - # compute reward model score - if self.use_rm: - reward_tensor = self.rm_wg.compute_rm_score(batch) - batch = batch.union(reward_tensor) - - if self.config.reward_model.launch_reward_fn_async: - future_reward = compute_reward_async.remote(data=batch, reward_fn=self.reward_fn) - else: - reward_tensor, reward_extra_infos_dict = compute_reward(batch, self.reward_fn) - # recompute old_log_probs - with marked_timer("old_log_prob", timing_raw, color="blue"): - async_training = self.config.get("async_training", None) - if async_training and async_training.use_rollout_log_probs: - batch.batch["old_log_probs"] = batch.batch["rollout_log_probs"] - batch.meta_info["temperature"] = self.config.actor_rollout_ref.rollout.temperature - - else: - old_log_prob = self.actor_rollout_wg.compute_log_prob(batch) - entropys = old_log_prob.batch["entropys"] - response_masks = batch.batch["response_mask"] - loss_agg_mode = self.config.actor_rollout_ref.actor.loss_agg_mode - entropy_agg = agg_loss(loss_mat=entropys, loss_mask=response_masks, loss_agg_mode=loss_agg_mode) - old_log_prob_metrics = {"actor/entropy": entropy_agg.detach().item()} - metrics.update(old_log_prob_metrics) - old_log_prob.batch.pop("entropys") - batch = batch.union(old_log_prob) - - if "rollout_log_probs" in batch.batch.keys(): - # TODO: we may want to add diff of probs too. 
- rollout_old_log_probs = batch.batch["rollout_log_probs"] - actor_old_log_probs = batch.batch["old_log_probs"] - attention_mask = batch.batch["attention_mask"] - responses = batch.batch["responses"] - response_length = responses.size(1) - response_mask = attention_mask[:, -response_length:] - - rollout_probs = torch.exp(rollout_old_log_probs) - actor_probs = torch.exp(actor_old_log_probs) - rollout_probs_diff = torch.abs(rollout_probs - actor_probs) - rollout_probs_diff = torch.masked_select(rollout_probs_diff, response_mask.bool()) - rollout_probs_diff_max = torch.max(rollout_probs_diff) - rollout_probs_diff_mean = torch.mean(rollout_probs_diff) - rollout_probs_diff_std = torch.std(rollout_probs_diff) - metrics.update( - { - "training/rollout_probs_diff_max": rollout_probs_diff_max.detach().item(), - "training/rollout_probs_diff_mean": rollout_probs_diff_mean.detach().item(), - "training/rollout_probs_diff_std": rollout_probs_diff_std.detach().item(), - } - ) - - if self.use_reference_policy: - # compute reference log_prob - with marked_timer("ref", timing_raw, color="olive"): - if not self.ref_in_actor: - ref_log_prob = self.ref_policy_wg.compute_ref_log_prob(batch) - else: - ref_log_prob = self.actor_rollout_wg.compute_ref_log_prob(batch) - batch = batch.union(ref_log_prob) - # compute values - if self.use_critic: - with marked_timer("values", timing_raw, color="cyan"): - values = self.critic_wg.compute_values(batch) - batch = batch.union(values) - with marked_timer("adv", timing_raw, color="brown"): - # we combine with rule-based rm - reward_extra_infos_dict: dict[str, list] - if self.config.reward_model.launch_reward_fn_async: - reward_tensor, reward_extra_infos_dict = ray.get(future_reward) - batch.batch["token_level_scores"] = reward_tensor - - if reward_extra_infos_dict: - batch.non_tensor_batch.update({k: np.array(v) for k, v in reward_extra_infos_dict.items()}) - - # compute rewards. 
apply_kl_penalty if available - if self.config.algorithm.use_kl_in_reward: - batch, kl_metrics = apply_kl_penalty( - batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.config.algorithm.kl_penalty - ) - metrics.update(kl_metrics) - else: - batch.batch["token_level_rewards"] = batch.batch["token_level_scores"] - - # compute advantages, executed on the driver process - - norm_adv_by_std_in_grpo = self.config.algorithm.get( - "norm_adv_by_std_in_grpo", True - ) # GRPO adv normalization factor - - batch = compute_advantage( - batch, - adv_estimator=self.config.algorithm.adv_estimator, - gamma=self.config.algorithm.gamma, - lam=self.config.algorithm.lam, - num_repeat=self.config.actor_rollout_ref.rollout.n, - norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo, - config=self.config.algorithm, - ) - # update critic - if self.use_critic: - with marked_timer("update_critic", timing_raw, color="pink"): - critic_output = self.critic_wg.update_critic(batch) - critic_output_metrics = reduce_metrics(critic_output.meta_info["metrics"]) - metrics.update(critic_output_metrics) - # implement critic warmup - if self.config.trainer.critic_warmup <= self.global_steps: - # update actor - with marked_timer("update_actor", timing_raw, color="red"): - batch.meta_info["multi_turn"] = self.config.actor_rollout_ref.rollout.multi_turn.enable - actor_output = self.actor_rollout_wg.update_actor(batch) - actor_output_metrics = reduce_metrics(actor_output.meta_info["metrics"]) - metrics.update(actor_output_metrics) - return batch, reward_extra_infos_dict - - def _log_rollout(self, batch, reward_extra_infos_dict, timing_raw): - """Log rollout generations if enabled""" - rollout_data_dir = self.config.trainer.get("rollout_data_dir", None) - if rollout_data_dir: - with marked_timer("dump_rollout_generations", timing_raw, color="green"): - inputs = self.tokenizer.batch_decode(batch.batch["prompts"], skip_special_tokens=True) - outputs = self.tokenizer.batch_decode(batch.batch["responses"], 
skip_special_tokens=True) - scores = batch.batch["token_level_scores"].sum(-1).cpu().tolist() - if "request_id" in batch.non_tensor_batch: - reward_extra_infos_dict.setdefault( - "request_id", - batch.non_tensor_batch["request_id"].tolist(), - ) - self._dump_generations( - inputs=inputs, - outputs=outputs, - scores=scores, - reward_extra_infos_dict=reward_extra_infos_dict, - dump_path=rollout_data_dir, - ) - - def _validate_metrics(self, is_last_step, last_val_metrics, metrics, timing_raw): - if ( - self.val_reward_fn is not None - and self.config.trainer.test_freq > 0 - and (is_last_step or self.global_steps % self.config.trainer.test_freq == 0) - ): - with marked_timer("testing", timing_raw, color="green"): - val_metrics: dict = self._validate() - if is_last_step: - last_val_metrics = val_metrics - metrics.update(val_metrics) - return last_val_metrics - - def _check_save_checkpoint(self, is_last_step, timing_raw): - # Check if the ESI (Elastic Server Instance)/training plan is close to expiration. - esi_close_to_expiration = should_save_ckpt_esi( - max_steps_duration=self.max_steps_duration, - redundant_time=self.config.trainer.esi_redundant_time, - ) - # Check if the conditions for saving a checkpoint are met. - # The conditions include a mandatory condition (1) and - # one of the following optional conditions (2/3/4): - # 1. The save frequency is set to a positive value. - # 2. It's the last training step. - # 3. The current step number is a multiple of the save frequency. - # 4. The ESI(Elastic Server Instance)/training plan is close to expiration. 
- if self.config.trainer.save_freq > 0 and ( - is_last_step or self.global_steps % self.config.trainer.save_freq == 0 or esi_close_to_expiration - ): - if esi_close_to_expiration: - print("Force saving checkpoint: ESI instance expiration approaching.") - with marked_timer("save_checkpoint", timing_raw, color="green"): - self._save_checkpoint() - - def _collect_metrics(self, batch, epoch, metrics, timing_raw): - steps_duration = timing_raw["step"] - self.max_steps_duration = max(self.max_steps_duration, steps_duration) - # training metrics - metrics.update( - { - "training/global_step": self.global_steps, - "training/epoch": epoch, - } - ) - # collect metrics - metrics.update(compute_data_metrics(batch=batch, use_critic=self.use_critic)) - metrics.update(compute_timing_metrics(batch=batch, timing_raw=timing_raw)) - # TODO: implement actual tflpo and theoretical tflpo - n_gpus = self.resource_pool_manager.get_n_gpus() - metrics.update(compute_throughout_metrics(batch=batch, timing_raw=timing_raw, n_gpus=n_gpus)) - - def _post_batch_processing(self, batch: DataProto): - # this is experimental and may be changed/removed in the future in favor of a general-purpose one - if isinstance(self.train_dataloader.sampler, AbstractCurriculumSampler): - self.train_dataloader.sampler.update(batch=batch) - - # this is experimental and may be changed/removed in the future - # in favor of a general-purpose data buffer pool - if hasattr(self.train_dataset, "on_batch_end"): - # The dataset may be changed after each training batch - self.train_dataset.on_batch_end(batch=batch) + # this is experimental and may be changed/removed in the future + # in favor of a general-purpose data buffer pool + if hasattr(self.train_dataset, "on_batch_end"): + # The dataset may be changed after each training batch + self.train_dataset.on_batch_end(batch=batch) \ No newline at end of file diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py 
b/verl/workers/rollout/vllm_rollout/vllm_async_server.py index 4826ebaa1d0..5125ab41f8b 100644 --- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py +++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py @@ -11,11 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import asyncio import logging import os import pickle -from typing import Any, Callable, Optional, Sequence +from typing import Any, Callable, Optional import ray import zmq @@ -207,12 +206,6 @@ def __init__(self, config: DictConfig, vllm_dp_size: int, vllm_dp_rank: int, wg_ self.wg_prefix = wg_prefix self.engine: AsyncLLM = None - # for cancel LLMServer - self.paused = False - self.lock = asyncio.Lock() - self.cancel_event: dict[str, asyncio.Event] = {} - self.req_output: dict[str, Optional[RequestOutput]] = {} - async def init_engine(self): """Init vLLM AsyncLLM engine.""" config = self.config @@ -334,60 +327,6 @@ async def generate(self, prompt_ids: list[int], sampling_params: dict[str, Any], return final_res.outputs[0].token_ids - async def _generate_step(self, prompt_ids: list[int], sampling_params: dict[str, Any], request_id: str): - max_tokens = self.max_model_len - len(prompt_ids) - sampling_params = SamplingParams(max_tokens=max_tokens, logprobs=1, **sampling_params) - prompt = TokensPrompt(prompt_token_ids=prompt_ids) - generator = self.engine.generate(prompt=prompt, sampling_params=sampling_params, request_id=request_id) - - # Get final response - self.req_output[request_id]: Optional[RequestOutput] = None - async for output in generator: - self.req_output[request_id] = output - assert self.req_output[request_id] is not None - - async def generate_for_partial( - self, prompt_ids: list[int], sampling_params: dict[str, Any], request_id: str - ) -> tuple[list[Any], list[Any], bool] | tuple[Sequence[int], list[float], Any]: - # 设置中断标志 - async with 
self.lock: - if self.paused: - # cancel 后, 所有任务直接返回,等待下次提交 - return [], [], True - self.cancel_event[request_id] = asyncio.Event() - cancel_handle = asyncio.create_task(self.cancel_event[request_id].wait()) - generation_handle = asyncio.create_task(self._generate_step(prompt_ids, sampling_params, request_id)) - - done, pend = await asyncio.wait([generation_handle, cancel_handle], return_when=asyncio.FIRST_COMPLETED) - - for task in done: - await task - - for task in pend: - task.cancel() - - async with self.lock: - token_ids = self.req_output[request_id].outputs[0].token_ids - log_probs: list[float] = [] - for i, x in enumerate(self.req_output[request_id].outputs[0].logprobs): - # sampling_params 中 logprobs 设置为1,应该返回1个, 但是实测会有多个,取token_id所对应的log_prob - token_id = self.req_output[request_id].outputs[0].token_ids[i] - log_probs.append(x[token_id].logprob) - is_cancel = generation_handle not in done - self.cancel_event.pop(request_id, None) - self.req_output.pop(request_id, None) - return token_ids, log_probs, is_cancel - - async def cancel(self): - async with self.lock: - self.paused = True - for request_id in self.cancel_event: - self.cancel_event[request_id].set() - - async def resume(self): - async with self.lock: - self.paused = False - async def wake_up(self): if self.config.rollout.free_cache_engine: await self.engine.wake_up() @@ -396,4 +335,4 @@ async def sleep(self): # TODO: https://github.com/vllm-project/vllm/issues/17103 await self.engine.reset_prefix_cache() if self.config.rollout.free_cache_engine: - await self.engine.sleep() + await self.engine.sleep() \ No newline at end of file From 6cf1da1101c8269fadae3416c5f60455b0e4cd57 Mon Sep 17 00:00:00 2001 From: wangshulin02 <953550366@qq.com> Date: Mon, 15 Sep 2025 19:57:46 +0800 Subject: [PATCH 133/182] ruff format --- recipe/fully_async_policy/agent_loop/__init__.py | 2 +- recipe/fully_async_policy/agent_loop/agent_loop.py | 6 ++++-- recipe/fully_async_policy/agent_loop/vllm_async_server.py | 2 -- 
recipe/fully_async_policy/fully_async_main.py | 2 +- verl/experimental/agent_loop/__init__.py | 2 +- verl/experimental/agent_loop/agent_loop.py | 2 +- verl/trainer/main_ppo.py | 2 +- verl/trainer/ppo/ray_trainer.py | 2 +- verl/workers/rollout/vllm_rollout/vllm_async_server.py | 2 +- 9 files changed, 11 insertions(+), 11 deletions(-) diff --git a/recipe/fully_async_policy/agent_loop/__init__.py b/recipe/fully_async_policy/agent_loop/__init__.py index 7e583cb220d..0796a0c3f5e 100644 --- a/recipe/fully_async_policy/agent_loop/__init__.py +++ b/recipe/fully_async_policy/agent_loop/__init__.py @@ -18,4 +18,4 @@ _ = [SingleTurnAgentLoop, PartialSingleTurnAgentLoop] -__all__ = ["AgentLoopBase", "AgentLoopManager"] \ No newline at end of file +__all__ = ["AgentLoopBase", "AgentLoopManager"] diff --git a/recipe/fully_async_policy/agent_loop/agent_loop.py b/recipe/fully_async_policy/agent_loop/agent_loop.py index 4e6c9ff9285..4f4496c8999 100644 --- a/recipe/fully_async_policy/agent_loop/agent_loop.py +++ b/recipe/fully_async_policy/agent_loop/agent_loop.py @@ -670,8 +670,9 @@ async def resume_async(self): await asyncio.gather(*[asyncio.wrap_future(future.future()) for future in futures], return_exceptions=True) - from verl.workers.rollout.async_server import AsyncServerBase + + def async_server_class( rollout_backend: str, rollout_backend_module: Optional[str] = None, rollout_backend_class: Optional[str] = None ) -> type[AsyncServerBase]: @@ -692,6 +693,7 @@ def async_server_class( if rollout_backend == "vllm": from recipe.fully_async_policy.agent_loop.vllm_async_server import AsyncvLLMServer + return AsyncvLLMServer else: raise NotImplementedError(f"rollout backend {rollout_backend} is not supported") @@ -701,4 +703,4 @@ def async_server_class( from verl.utils.import_utils import load_extern_type - return load_extern_type(rollout_backend_module, rollout_backend_class) \ No newline at end of file + return load_extern_type(rollout_backend_module, rollout_backend_class) diff 
--git a/recipe/fully_async_policy/agent_loop/vllm_async_server.py b/recipe/fully_async_policy/agent_loop/vllm_async_server.py index 03fc28c8549..4826ebaa1d0 100644 --- a/recipe/fully_async_policy/agent_loop/vllm_async_server.py +++ b/recipe/fully_async_policy/agent_loop/vllm_async_server.py @@ -397,5 +397,3 @@ async def sleep(self): await self.engine.reset_prefix_cache() if self.config.rollout.free_cache_engine: await self.engine.sleep() - - diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py index 699222f350a..79bdc4114db 100644 --- a/recipe/fully_async_policy/fully_async_main.py +++ b/recipe/fully_async_policy/fully_async_main.py @@ -194,7 +194,7 @@ def _initialize_components(self, config) -> None: print(f"total_train_steps {total_train_steps}") ray.get(self.components["trainer"].set_total_train_steps.remote(total_train_steps)) - # max_queue_size + # max_queue_size max_queue_size = ray.get(self.components["rollouter"].get_max_queue_size.remote()) print(f"[ASYNC MAIN] Creating MessageQueue... 
max_queue_size {max_queue_size}") message_queue = MessageQueue.remote(config, max_queue_size) diff --git a/verl/experimental/agent_loop/__init__.py b/verl/experimental/agent_loop/__init__.py index c6f58f83c83..a39171db764 100644 --- a/verl/experimental/agent_loop/__init__.py +++ b/verl/experimental/agent_loop/__init__.py @@ -18,4 +18,4 @@ _ = [SingleTurnAgentLoop, ToolAgentLoop] -__all__ = ["AgentLoopBase", "AgentLoopManager"] \ No newline at end of file +__all__ = ["AgentLoopBase", "AgentLoopManager"] diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py index 4639229a3b0..ef86381020b 100644 --- a/verl/experimental/agent_loop/agent_loop.py +++ b/verl/experimental/agent_loop/agent_loop.py @@ -540,4 +540,4 @@ def wake_up(self): def sleep(self): """Sleep all rollout server instances.""" - ray.get([server.sleep.remote() for server in self.async_llm_servers]) \ No newline at end of file + ray.get([server.sleep.remote() for server in self.async_llm_servers]) diff --git a/verl/trainer/main_ppo.py b/verl/trainer/main_ppo.py index 8d2b811c733..a9ea554687a 100644 --- a/verl/trainer/main_ppo.py +++ b/verl/trainer/main_ppo.py @@ -334,4 +334,4 @@ def create_rl_sampler(data_config, dataset): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py index 05281ebe3f9..6a82a4bcf2b 100644 --- a/verl/trainer/ppo/ray_trainer.py +++ b/verl/trainer/ppo/ray_trainer.py @@ -1365,4 +1365,4 @@ def fit(self): # in favor of a general-purpose data buffer pool if hasattr(self.train_dataset, "on_batch_end"): # The dataset may be changed after each training batch - self.train_dataset.on_batch_end(batch=batch) \ No newline at end of file + self.train_dataset.on_batch_end(batch=batch) diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py index 5125ab41f8b..988dac407d7 100644 --- 
a/verl/workers/rollout/vllm_rollout/vllm_async_server.py +++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py @@ -335,4 +335,4 @@ async def sleep(self): # TODO: https://github.com/vllm-project/vllm/issues/17103 await self.engine.reset_prefix_cache() if self.config.rollout.free_cache_engine: - await self.engine.sleep() \ No newline at end of file + await self.engine.sleep() From aa370b4cea5dfa76bd9a5fb3c751b216b6000da1 Mon Sep 17 00:00:00 2001 From: wangshulin02 <953550366@qq.com> Date: Mon, 15 Sep 2025 20:33:38 +0800 Subject: [PATCH 134/182] add anomaly detection and exit --- recipe/fully_async_policy/fully_async_main.py | 34 +++++++++++++++---- 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py index 79bdc4114db..f41ab2df826 100644 --- a/recipe/fully_async_policy/fully_async_main.py +++ b/recipe/fully_async_policy/fully_async_main.py @@ -267,15 +267,37 @@ def _create_trainer(self, config) -> None: def _run_training_loop(self): self.running = True - print("[ASYNC MAIN] Starting Rollouter in background...") + print("[ASYNC MAIN] Starting Rollouter and Trainer...") rollouter_future = self.components["rollouter"].fit.remote() trainer_future = self.components["trainer"].fit.remote() - ray.get(rollouter_future) - ray.get(trainer_future) - - self.components["message_queue_client"].clear_queue() - print("[ASYNC MAIN] Training completed or interrupted") + futures = [rollouter_future, trainer_future] + + try: + while futures: + # Use ray.wait to monitor all futures and return when any one is completed. 
+ done_futures, remaining_futures = ray.wait(futures, num_returns=1, timeout=None) + + for future in done_futures: + try: + ray.get(future) + print(f"[ASYNC MAIN] One component completed successfully") + except Exception as e: + print(f"[ASYNC MAIN] Component failed with error: {e}") + for remaining_future in remaining_futures: + ray.cancel(remaining_future) + raise e + + futures = remaining_futures + + except Exception as e: + print(f"[ASYNC MAIN] Training failed: {e}") + for future in futures: + ray.cancel(future) + raise + finally: + self.components["message_queue_client"].clear_queue() + print("[ASYNC MAIN] Training completed or interrupted") @hydra.main(config_path="config", config_name="fully_async_ppo_trainer", version_base=None) From 0a2763dea59f8cbb1e0566eb0d7a1bfc9346048c Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Tue, 16 Sep 2025 11:13:15 +0800 Subject: [PATCH 135/182] qwen3-32b-96-32 --- .../fsdp2_fully-async_96-32/run.sh | 153 ++++++++++++++++++ .../fsdp2_fully-async_96-32/runtime_env.yaml | 4 + 2 files changed, 157 insertions(+) create mode 100644 recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_96-32/run.sh create mode 100644 recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_96-32/runtime_env.yaml diff --git a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_96-32/run.sh b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_96-32/run.sh new file mode 100644 index 00000000000..827e9a30e41 --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_96-32/run.sh @@ -0,0 +1,153 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +project_name='DAPO' +exp_name='dapo_qwen2-32B_20k_fsdp2_fully-async_96-32-tps1' + +# Paths +MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/Qwen2.5-32B +CKPTS_DIR=./ckpts/${project_name}/${exp_name} +TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet 
+TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet + +rollout_mode="async" +rollout_name="vllm" # sglang or vllm +if [ "$rollout_mode" = "async" ]; then + export VLLM_USE_V1=1 + return_raw_chat="True" +fi + +# Algorithm parameters +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +# Response length parameters +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 * 20)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 + +# Training parameters +loss_agg_mode="token-mean" + +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + +# Performance Related Parameter +use_dynamic_bsz=True +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) +ref_offload=True +actor_offload=False +gen_tp=4 +sp_size=8 +fsdp_size=-1 + +# Fully async specific parameters +NNODES_ROLLOUT=${NNODES_ROLLOUT:-12} +NNODES_TRAIN=${NNODES_TRAIN:-4} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} + +train_prompt_bsz=0 +gen_prompt_bsz=1 +n_resp_per_prompt=16 +train_prompt_mini_bsz=128 +total_rollout_steps=$(((512*400))) +test_freq=20 +staleness_threshold=0.1 +trigger_parameter_sync_step=2 +partial_rollout=True + +python -m recipe.fully_async_policy.fully_async_main \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + data.gen_batch_size=${gen_prompt_bsz} \ + data.return_raw_chat=${return_raw_chat} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + 
algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.strategy=fsdp2 \ + critic.strategy=fsdp2 \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.hybrid_engine=False \ + +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ + actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.grad_clip=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.temperature=${temperature} \ + 
actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \ + actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ + actor_rollout_ref.rollout.name=${rollout_name} \ + actor_rollout_ref.rollout.mode=${rollout_mode} \ + actor_rollout_ref.rollout.calculate_log_probs=True \ + reward_model.reward_manager=dapo \ + +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger=['console','tensorboard'] \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.val_before_train=True \ + trainer.save_freq=-1 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.resume_mode=auto \ + trainer.nnodes="${NNODES_TRAIN}" \ + trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ + rollout.nnodes="${NNODES_ROLLOUT}" \ + rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ + rollout.total_rollout_steps="${total_rollout_steps}" \ + rollout.total_epochs=10 \ + rollout.test_freq="${test_freq}" \ + async_training.staleness_threshold="${staleness_threshold}" \ + async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ + async_training.partial_rollout="${partial_rollout}" diff --git 
a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_96-32/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_96-32/runtime_env.yaml new file mode 100644 index 00000000000..be4ab6a6349 --- /dev/null +++ b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_96-32/runtime_env.yaml @@ -0,0 +1,4 @@ +env_vars: + VLLM_USE_V1: "1" + TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-32B-128/dapo_qwen2-32B_20k_fsdp2_fully-async_96-32-tps1" + HYDRA_FULL_ERROR: "1" \ No newline at end of file From dd534e09c6ba09e895c6733b7d6bebc3448b13ce Mon Sep 17 00:00:00 2001 From: wangshulin02 <953550366@qq.com> Date: Tue, 16 Sep 2025 15:16:04 +0800 Subject: [PATCH 136/182] add rollouter&trainer idle time --- recipe/fully_async_policy/detach_utils.py | 8 ++++- .../fully_async_rollouter.py | 32 ++++++++++++++----- .../fully_async_policy/fully_async_trainer.py | 31 ++++++++++-------- recipe/fully_async_policy/param_sync.py | 4 +-- 4 files changed, 50 insertions(+), 25 deletions(-) diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py index 75d67ec1ab1..ad12ef69057 100644 --- a/recipe/fully_async_policy/detach_utils.py +++ b/recipe/fully_async_policy/detach_utils.py @@ -52,7 +52,7 @@ class RolloutSample: @dataclass class ValidateMetrics: timing_raw: dict[str, Any] - metrics: dict[str, Any] + metrics: Optional[dict[str, Any]] = None global_steps: Optional[int] = None param_version: Optional[int] = None @@ -362,14 +362,20 @@ def get_aggregated_metrics(self) -> dict[str, Any]: def _special_metrics_aggergate(self, aggregated: dict[str, Any]) -> dict[str, Any]: """calculate special metrics""" + # global_seqlen/minmax_diff if "global_seqlen/minmax_diff" in aggregated.keys(): aggregated["global_seqlen/minmax_diff"] = aggregated["global_seqlen/max"] - aggregated["global_seqlen/min"] + # perf/throughput REQUIRED_PERF_KEYS = {"perf/throughput", 
"perf/total_num_tokens", "perf/time_per_step"} if REQUIRED_PERF_KEYS.issubset(aggregated): aggregated["perf/throughput"] = aggregated["perf/total_num_tokens"] / ( aggregated["perf/time_per_step"] * self.total_gpus ) + + # trainer/idle_ratio + if "timing_s/gen" in aggregated.keys() and "timing_s/step" in aggregated.keys(): + aggregated["trainer/idle_ratio"] = aggregated["timing_s/gen"] / aggregated["timing_s/step"] return aggregated diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 919314ba1b5..2134e6d0e38 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -127,6 +127,8 @@ def __init__( self.dropped_stale_samples = 0 self.processed_sample_count = 0 # 已处理的样本计数 self.global_steps = 0 + self.idle_start_time = None + self.version_start_time = None # Concurrency control self.paused = False @@ -203,24 +205,37 @@ async def update_param_version(self, version: int, validate: bool = False, globa + self.cancel_queue.qsize() + await self.message_queue_client.get_queue_size() ) + timing_raw = {} + idle_ratio = None + if self.idle_start_time is not None and self.version_start_time is not None: + rollout_active_time = self.idle_start_time - self.version_start_time + rollout_version_time = time.time() - self.version_start_time + idle_ratio = 1 - rollout_active_time / rollout_version_time + timing_raw["rollouter/active_time"] = rollout_active_time + timing_raw["rollouter/version_time"] = rollout_version_time + timing_raw["rollouter/idle_ratio"] = idle_ratio + self.idle_start_time = None print( f"[FullyAsyncRollouter][Public][update_param_version] " f"Parameter version updated from {old_version} to {version} " f",reset staleness_samples to: {self.staleness_samples}" + f",idle_ratio: {idle_ratio}" ) - timing_raw = {} + val_metrics = None if ( self.val_reward_fn is not None and self.config.rollout.test_freq > 0 and self.current_param_version % 
self.config.rollout.test_freq == 0 and self.current_param_version > 0 # don't test here in the initial parameter sync ) or (validate and self.val_reward_fn is not None): - with marked_timer("testing", timing_raw, color="green"): + with marked_timer("rollouter/validate_time", timing_raw, color="green"): val_metrics: dict = self._validate() - data = ValidateMetrics( - timing_raw=timing_raw, metrics=val_metrics, global_steps=global_steps, param_version=version - ) - await self.message_queue_client.put_validate(ray.cloudpickle.dumps(data)) + data = ValidateMetrics( + timing_raw=timing_raw, metrics=val_metrics, global_steps=global_steps, param_version=version + ) + await self.message_queue_client.put_validate(ray.cloudpickle.dumps(data)) + + self.version_start_time = time.time() def _validate_config(self): # Validate asynchronous training configuration @@ -320,6 +335,8 @@ async def _processor_worker(self): # self.paused 由 pause() 和 self._should_pause_generation() 负责修改 if self.paused or await self._should_pause_generation(): print("[FullyAsyncRollouter][Processor] 收到暂停信号,等待剩余任务完成...") + async with self.lock: + self.paused = True while self.active_tasks: async with self.lock: # 获取锁后,active_tasks 数量会发生变化,需要再次校验 @@ -329,11 +346,10 @@ async def _processor_worker(self): ) for task in done_tasks: await task - async with self.lock: - self.paused = True async with self.lock: while self.paused: + self.idle_start_time = time.time() await self.condition.wait() # 获取待处理的部分 RolloutSample diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 0c1501cbf89..66d96c4b09b 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -263,13 +263,14 @@ def fit(self): # get validate data before training if self.config.trainer.val_before_train and self.reward_fn is not None: - ray.get(self.param_synchronizer.wait_last_sync.remote()) + 
ray.get(self.param_synchronizer.wait_last_valid.remote()) val_data = self.message_queue_client.get_validate_sync() if val_data: val_data: ValidateMetrics = ray.cloudpickle.loads(val_data) - self.logger.log(data=val_data.metrics, step=val_data.param_version) - self.logger.log(data=val_data.timing_raw, step=val_data.param_version) - pprint(f"[FullyAsyncTrainer] Initial validation metrics: {val_data.metrics}") + if val_data.metrics: + self.logger.log(data=val_data.metrics, step=val_data.param_version) + pprint(f"[FullyAsyncTrainer] Initial validation metrics: {val_data.metrics}") + self.logger.log(data=val_data.timing_raw, step=val_data.param_version) # Use queue mode, no need for traditional dataloader iterator # Initialize to get the first batch of data @@ -320,24 +321,26 @@ def fit(self): val_data = self.message_queue_client.get_validate_sync() if val_data: val_data: ValidateMetrics = ray.cloudpickle.loads(val_data) - self.logger.log(data=val_data.metrics, step=val_data.param_version) + if val_data.metrics: + self.logger.log(data=val_data.metrics, step=val_data.param_version) + pprint( + f"[FullyAsyncTrainer] parameter version: {val_data.param_version} \ + Validation metrics: {val_data.metrics}" + ) self.logger.log(data=val_data.timing_raw, step=val_data.param_version) - pprint( - f"[FullyAsyncTrainer] parameter version: {val_data.param_version} \ - Validation metrics: {val_data.metrics}" - ) self.global_steps += 1 # final parameter sync and validate - if val_data is None: + if val_data is None or val_data.metrics is None: self._trigger_parameter_sync_after_step(validate=True, global_steps=self.global_steps - 1) - ray.get(self.param_synchronizer.wait_last_sync.remote()) + ray.get(self.param_synchronizer.wait_last_valid.remote()) val_data = self.message_queue_client.get_validate_sync() if val_data: val_data: ValidateMetrics = ray.cloudpickle.loads(val_data) - self.logger.log(data=val_data.metrics, step=val_data.param_version) + if val_data.metrics: + 
self.logger.log(data=val_data.metrics, step=val_data.param_version) + pprint(f"[FullyAsyncTrainer] Final validation metrics: {val_data.metrics}") self.logger.log(data=val_data.timing_raw, step=val_data.param_version) - pprint(f"[FullyAsyncTrainer] Final validation metrics: {val_data.metrics}") else: pprint(f"[FullyAsyncTrainer] Final validation metrics: {val_data.metrics}") self.progress_bar.close() @@ -364,7 +367,7 @@ def _trigger_parameter_sync_after_step(self, validate: bool = False, global_step ) self.progress_bar.update(1) self.metrics_aggregator.reset() - ray.get(self.param_synchronizer.wait_last_sync.remote()) + ray.get(self.param_synchronizer.wait_last_valid.remote()) ray.get( self.param_synchronizer.sync_weights.remote( self.current_param_version, validate=validate, global_steps=global_steps diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py index 2a58292ff78..55d11d236c0 100644 --- a/recipe/fully_async_policy/param_sync.py +++ b/recipe/fully_async_policy/param_sync.py @@ -95,8 +95,8 @@ def sync_weights(self, version, validate=False, global_steps=0): self.wait_last_update = self.rollouter.update_param_version.remote(version, validate, global_steps) self.wait_last_resume = self.rollouter.resume.remote() - def wait_last_sync(self): - print("[ParameterSynchronizer] waiting last parameter sync and validate...") + def wait_last_valid(self): + print("[ParameterSynchronizer] waiting last validate...") start_time = time.time() if self.wait_last_update: ray.get(self.wait_last_update) From 67de99f199493852390026feec3a34fe28344475 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Tue, 16 Sep 2025 15:37:18 +0800 Subject: [PATCH 137/182] refactor code rm megatron code --- hzg_test/name_ip.py | 21 ++ .../agent_loop/agent_loop.py | 3 +- recipe/fully_async_policy/fsdp_workers.py | 24 +-- recipe/fully_async_policy/fully_async_main.py | 15 +- recipe/fully_async_policy/megatron_workers.py | 200 ------------------ 
.../vllm_rollout/__init__.py | 0 .../vllm_async_server.py | 0 tests/special_e2e/run_fully_async_policy.sh | 6 +- 8 files changed, 39 insertions(+), 230 deletions(-) create mode 100644 hzg_test/name_ip.py delete mode 100644 recipe/fully_async_policy/megatron_workers.py create mode 100644 recipe/fully_async_policy/vllm_rollout/__init__.py rename recipe/fully_async_policy/{agent_loop => vllm_rollout}/vllm_async_server.py (100%) diff --git a/hzg_test/name_ip.py b/hzg_test/name_ip.py new file mode 100644 index 00000000000..d47b0890d91 --- /dev/null +++ b/hzg_test/name_ip.py @@ -0,0 +1,21 @@ +import ray + +# 初始化Ray +if not ray.is_initialized(): + ray.init() + +# 获取所有节点的信息 +nodes = ray.nodes() + +# 打印表头 +print(f"{'机器名':<20} {'IP地址':<15}") +print("-" * 40) + +# 遍历所有节点并打印信息 +for node in nodes: + # 节点地址格式通常为 "IP:端口",我们只需要IP部分 + ip_address = node["NodeManagerAddress"].split(":")[0] + # 机器名(主机名) + node_name = node["NodeManagerHostname"] + + print(f"{node_name:<20} {ip_address:<15}") \ No newline at end of file diff --git a/recipe/fully_async_policy/agent_loop/agent_loop.py b/recipe/fully_async_policy/agent_loop/agent_loop.py index 4f4496c8999..4da6b562c4c 100644 --- a/recipe/fully_async_policy/agent_loop/agent_loop.py +++ b/recipe/fully_async_policy/agent_loop/agent_loop.py @@ -692,8 +692,7 @@ def async_server_class( # importlib.import_module and from ... import ... 
have subtle differences in ray if rollout_backend == "vllm": - from recipe.fully_async_policy.agent_loop.vllm_async_server import AsyncvLLMServer - + from recipe.fully_async_policy.vllm_rollout.vllm_async_server import AsyncvLLMServer return AsyncvLLMServer else: raise NotImplementedError(f"rollout backend {rollout_backend} is not supported") diff --git a/recipe/fully_async_policy/fsdp_workers.py b/recipe/fully_async_policy/fsdp_workers.py index 41fa3a55eec..7a1b59aa64c 100644 --- a/recipe/fully_async_policy/fsdp_workers.py +++ b/recipe/fully_async_policy/fsdp_workers.py @@ -99,6 +99,18 @@ def sync_rollout_weights(self): inference_model.load_weights([(key, tensor)]) get_torch_device().empty_cache() + +class DetachActorWorker(DetachNcclSync): + def _get_actor_params(self): + assert self._is_actor + params = self.actor_module_fsdp.state_dict() + from verl.utils.model import convert_weight_keys + + params = convert_weight_keys( + params, getattr(self.actor_module_fsdp, "_fsdp_wrapped_module", self.actor_module_fsdp) + ) + return params + @register(dispatch_mode=Dispatch.ONE_TO_ALL) def get_actor_weights_info(self): assert self._is_actor @@ -120,18 +132,6 @@ def get_actor_weights_info(self): return ret -class DetachActorWorker(DetachNcclSync): - def _get_actor_params(self): - assert self._is_actor - params = self.actor_module_fsdp.state_dict() - from verl.utils.model import convert_weight_keys - - params = convert_weight_keys( - params, getattr(self.actor_module_fsdp, "_fsdp_wrapped_module", self.actor_module_fsdp) - ) - return params - - class DetachRolloutWorker(DetachNcclSync): def __init__(self, config: DictConfig, role: str): Worker.__init__(self) diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py index f41ab2df826..c0f156296a2 100644 --- a/recipe/fully_async_policy/fully_async_main.py +++ b/recipe/fully_async_policy/fully_async_main.py @@ -90,17 +90,7 @@ def create_role_worker_mapping(config): 
ray_worker_group_cls = RayWorkerGroup - elif config.actor_rollout_ref.actor.strategy == "megatron": - assert config.actor_rollout_ref.actor.strategy == config.critic.strategy - from recipe.fully_async_policy.megatron_workers import ( - CriticWorker, - DetachActorWorker, - DetachAsyncRolloutWorker, - ) - from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup - - ray_worker_group_cls = NVMegatronRayWorkerGroup - + # TODO megatron support else: raise NotImplementedError(f"Unsupported strategy: {config.actor_rollout_ref.actor.strategy}") @@ -113,8 +103,7 @@ def create_role_worker_mapping(config): if config.reward_model.enable: if config.reward_model.strategy == "fsdp2": from verl.workers.fsdp_workers import RewardModelWorker - elif config.reward_model.strategy == "megatron": - from verl.workers.megatron_workers import RewardModelWorker + # TODO megatron support else: raise NotImplementedError(f"Unsupported reward model strategy: {config.reward_model.strategy}") diff --git a/recipe/fully_async_policy/megatron_workers.py b/recipe/fully_async_policy/megatron_workers.py deleted file mode 100644 index a9318b8f7b3..00000000000 --- a/recipe/fully_async_policy/megatron_workers.py +++ /dev/null @@ -1,200 +0,0 @@ -# Copyright 2025 Bytedance Ltd. and/or its affiliates -# Copyright 2025 Meituan Ltd. and/or its affiliates -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import logging -import os - -import torch -import torch.distributed -from omegaconf import DictConfig, OmegaConf - -from verl.single_controller.base.decorator import Dispatch, register -from verl.utils.debug import ( - log_gpu_memory_usage, -) -from verl.utils.device import get_device_name, get_torch_device -from verl.utils.fs import copy_to_local -from verl.utils.vllm_utils import patch_vllm_moe_model_weight_loader -from verl.workers.megatron_workers import ( - ActorRolloutRefWorker, - AsyncActorRolloutRefWorker, - CriticWorker, -) - -logger = logging.getLogger(__file__) -logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) - -__all__ = ["DetachActorWorker", "DetachRolloutWorker", "DetachAsyncRolloutWorker", "CriticWorker"] - - -class DetachNcclSync(ActorRolloutRefWorker): - def _get_actor_params_generator(self): - pass - - @register(dispatch_mode=Dispatch.ONE_TO_ALL, blocking=False) - def sync_rollout_weights(self): - assert (self._is_actor or self._is_rollout) and not self.config.hybrid_engine - assert hasattr(self, "_weights_info") and self._weights_info is not None - - params_generator = self._get_actor_params_generator() if self._is_actor else None - if self._is_rollout: - inference_model = ( - self.rollout.inference_engine.llm_engine.model_executor.driver_worker.worker.model_runner.model - ) - patch_vllm_moe_model_weight_loader(inference_model) - for key, shape, dtype in self._weights_info: - if self._is_actor: - weight_key, weight = next(params_generator) - assert key == weight_key - assert shape == weight.size() - assert dtype == weight.dtype - - tensor = torch.empty(shape, dtype=dtype, device=get_torch_device().current_device()) - if self._is_actor and torch.distributed.get_rank() == 0: - tensor.copy_(weight) - from ray.util.collective import collective - - collective.broadcast(tensor, src_rank=0, group_name="actor_rollout") - if self._is_rollout: - inference_model.load_weights([(key, tensor)]) - - @register(dispatch_mode=Dispatch.ONE_TO_ALL) - def 
get_actor_weights_info(self): - assert self._is_actor - if hasattr(self, "_weights_info"): - return self._weights_info - - params_generator = self._get_actor_params_generator() - ret = [] - for key, tensor in params_generator: - ret.append((key, tensor.size(), tensor.dtype)) - - self._weights_info = ret - return ret - - -class DetachActorWorker(DetachNcclSync): - def _get_actor_params_generator(self): - assert self._is_actor - from verl.models.mcore import get_mcore_weight_converter - from verl.utils.megatron_utils import per_tensor_generator - - layer_name_mapping = { - "qkv_layer_name": "self_attention.linear_qkv.", - "gate_proj_layer_name": "linear_fc1.", - } - weight_converter = get_mcore_weight_converter(self.actor_model_config, self.dtype) - generator = per_tensor_generator( - self.actor.actor_module, - self.actor_model_config, - weight_converter, - self.tf_config, - layer_name_mapping, - ) - return generator - - -class DetachRolloutWorker(DetachNcclSync): - @register(dispatch_mode=Dispatch.ONE_TO_ALL) - def init_model(self): - if self.config.model.get("external_lib", None) is not None: - # This is used to import external_lib into the huggingface systems - import importlib - - importlib.import_module(self.config.model.external_lib) - - from verl.utils.torch_dtypes import PrecisionType - - override_model_config = OmegaConf.to_container(OmegaConf.create(self.config.model.get("override_config", {}))) - override_transformer_config = {} - self.param_dtype = torch.bfloat16 - self.dtype = PrecisionType.to_dtype(self.param_dtype) - trust_remote_code = self.config.model.get("trust_remote_code", False) - - from verl.utils.model import get_generation_config - - self._init_hf_config_and_tf_config( - self.config.model.path, - self.config.model.path, - self.dtype, - override_model_config, - override_transformer_config, - trust_remote_code, - ) - self.generation_config = get_generation_config(self.local_path) - - from torch.distributed.device_mesh import init_device_mesh - 
- assert self.config.rollout.name == "vllm" - - from verl.workers.rollout.vllm_rollout import vLLMRollout - - # NOTE(sgm): If the QKV and gate_up projection layer are concate together in actor, - # we will reorganize their weight format when resharding from actor to rollout. - - infer_tp = self.config.rollout.tensor_model_parallel_size - dp = self.world_size // infer_tp - assert self.world_size % infer_tp == 0, ( - f"rollout world_size: {self.world_size} is not divisible by infer_tp: {infer_tp}" - ) - rollout_device_mesh = init_device_mesh( - get_device_name(), mesh_shape=(dp, infer_tp), mesh_dim_names=["dp", "infer_tp"] - ) - log_gpu_memory_usage("Before building vllm rollout", logger=None) - - local_path = copy_to_local(self.config.model.path, use_shm=self.config.model.get("use_shm", False)) - from verl.workers.rollout.vllm_rollout import vLLMAsyncRollout - - vllm_rollout_cls = vLLMRollout if self.config.rollout.mode == "sync" else vLLMAsyncRollout - rollout = vllm_rollout_cls( - model_path=local_path, - config=self.config.rollout, - tokenizer=self.tokenizer, - model_hf_config=self.hf_config, - device_mesh=rollout_device_mesh, - trust_remote_code=trust_remote_code, - ) - log_gpu_memory_usage("After building vllm rollout", logger=logger) - - from .detach_sharding_manager import DetachShardingManager - - rollout_sharding_manager = DetachShardingManager( - inference_engine=rollout.inference_engine, device_mesh=rollout_device_mesh - ) - - log_gpu_memory_usage("After building sharding manager", logger=logger) - - self.rollout = rollout - self.sharding_manager = rollout_sharding_manager - self.rollout.sharding_manager = rollout_sharding_manager - - @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO, blocking=False) - def async_generate_sequences(self, *args, **kwargs): - return super().generate_sequences(*args, **kwargs) - - @register(dispatch_mode=Dispatch.ONE_TO_ALL) - def set_actor_weights_info(self, weights_info): - assert self._is_rollout - self._weights_info = 
weights_info - - -class DetachAsyncRolloutWorker(AsyncActorRolloutRefWorker, DetachRolloutWorker): - def __init__(self, config: DictConfig, role: str): - print(DetachAsyncRolloutWorker.__mro__) - DetachRolloutWorker.__init__(self, config, role) - - @register(dispatch_mode=Dispatch.ONE_TO_ALL) - def init_model(self): - DetachRolloutWorker.init_model(self) diff --git a/recipe/fully_async_policy/vllm_rollout/__init__.py b/recipe/fully_async_policy/vllm_rollout/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/recipe/fully_async_policy/agent_loop/vllm_async_server.py b/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py similarity index 100% rename from recipe/fully_async_policy/agent_loop/vllm_async_server.py rename to recipe/fully_async_policy/vllm_rollout/vllm_async_server.py diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh index 4813f159696..2ddc61910ba 100644 --- a/tests/special_e2e/run_fully_async_policy.sh +++ b/tests/special_e2e/run_fully_async_policy.sh @@ -49,8 +49,8 @@ top_k=-1 val_top_p=0.7 # Fully async specific parameters -n_gpus_rollout=4 -n_gpus_training=$((NUM_GPUS - n_gpus_rollout)) +n_gpus_rollout=1 +n_gpus_training=1 train_prompt_bsz=0 gen_prompt_bsz=1 @@ -118,7 +118,7 @@ common_params=( trainer.logger=['console'] trainer.project_name='verl-test-fully-async' trainer.experiment_name="${exp_name}" - trainer.val_before_train=True + trainer.val_before_train=False trainer.save_freq=-1 trainer.resume_mode=disable trainer.nnodes=1 From a66c4cf252a42a2a5209b1fffca005f2d23a9dbb Mon Sep 17 00:00:00 2001 From: wangshulin02 <953550366@qq.com> Date: Tue, 16 Sep 2025 15:37:49 +0800 Subject: [PATCH 138/182] set required_samples=ppo_mini_bs & set max_concurrent_samples=rollout_dp_size*16 --- recipe/fully_async_policy/fully_async_rollouter.py | 7 +------ recipe/fully_async_policy/fully_async_trainer.py | 10 +--------- 2 files changed, 2 insertions(+), 15 deletions(-) diff --git 
a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 2134e6d0e38..8fbed0f0b65 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -165,12 +165,7 @@ async def set_required_samples(self, required_samples: int): ) # 单次最多扔一次更新需要的样本 - self.max_concurrent_samples = int( - self.config.actor_rollout_ref.actor.ppo_mini_batch_size - / self.config.actor_rollout_ref.rollout.n - * self.async_rollout_manager.rollout_dp_size - * 8 - ) + self.max_concurrent_samples = self.async_rollout_manager.rollout_dp_size * 16 self.max_concurrent_samples = min(self.max_concurrent_samples, self.max_required_samples) self.max_queue_size = self.max_required_samples diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 66d96c4b09b..0f0c35d7db5 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -113,15 +113,7 @@ def __init__( self.trigger_parameter_sync_step = config.async_training.trigger_parameter_sync_step # calculate required_samples - ppo_mini_batch_size = config.actor_rollout_ref.actor.ppo_mini_batch_size - rollout_n = config.actor_rollout_ref.rollout.n - if ppo_mini_batch_size % rollout_n != 0: - raise ValueError( - f"PPO mini batch size ({ppo_mini_batch_size}) must be divisible by rollout n ({rollout_n})" - ) - self.required_samples = int( - self.minimal_bsz * config.actor_rollout_ref.actor.ppo_mini_batch_size / config.actor_rollout_ref.rollout.n - ) + self.required_samples = config.actor_rollout_ref.actor.ppo_mini_batch_size total_gpus = ( config.trainer.nnodes * config.trainer.n_gpus_per_node + config.rollout.nnodes * config.rollout.n_gpus_per_node From 0ae200ec7b9a86bf77160118471d7d3b9e1dfa3e Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Tue, 16 Sep 2025 15:39:45 +0800 Subject: [PATCH 139/182] rm code --- hzg_test/name_ip.py | 
21 --------------------- 1 file changed, 21 deletions(-) delete mode 100644 hzg_test/name_ip.py diff --git a/hzg_test/name_ip.py b/hzg_test/name_ip.py deleted file mode 100644 index d47b0890d91..00000000000 --- a/hzg_test/name_ip.py +++ /dev/null @@ -1,21 +0,0 @@ -import ray - -# 初始化Ray -if not ray.is_initialized(): - ray.init() - -# 获取所有节点的信息 -nodes = ray.nodes() - -# 打印表头 -print(f"{'机器名':<20} {'IP地址':<15}") -print("-" * 40) - -# 遍历所有节点并打印信息 -for node in nodes: - # 节点地址格式通常为 "IP:端口",我们只需要IP部分 - ip_address = node["NodeManagerAddress"].split(":")[0] - # 机器名(主机名) - node_name = node["NodeManagerHostname"] - - print(f"{node_name:<20} {ip_address:<15}") \ No newline at end of file From 9cfacc2bd32ce1ae03556726351b72cdcd61c042 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Tue, 16 Sep 2025 20:03:11 +0800 Subject: [PATCH 140/182] refactor 1 --- .../fully_async_policy/agent_loop/__init__.py | 7 +- .../agent_loop/agent_loop.py | 599 +-------- recipe/fully_async_policy/detach_utils.py | 83 +- recipe/fully_async_policy/fsdp_workers.py | 30 +- recipe/fully_async_policy/fully_async_main.py | 7 +- .../fully_async_rollouter.py | 11 +- .../fully_async_policy/fully_async_trainer.py | 11 +- recipe/fully_async_policy/main_ppo.py | 344 ------ recipe/fully_async_policy/ray_trainer.py | 1069 ++--------------- verl/experimental/agent_loop/__init__.py | 4 +- verl/experimental/agent_loop/agent_loop.py | 59 +- verl/trainer/main_ppo.py | 10 +- verl/trainer/ppo/ray_trainer.py | 30 +- verl/trainer/ppo/utils.py | 31 + 14 files changed, 337 insertions(+), 1958 deletions(-) delete mode 100644 recipe/fully_async_policy/main_ppo.py diff --git a/recipe/fully_async_policy/agent_loop/__init__.py b/recipe/fully_async_policy/agent_loop/__init__.py index 0796a0c3f5e..5f059078964 100644 --- a/recipe/fully_async_policy/agent_loop/__init__.py +++ b/recipe/fully_async_policy/agent_loop/__init__.py @@ -12,10 +12,5 @@ # See the License for the specific language governing permissions and # limitations under the 
License. -from .agent_loop import AgentLoopBase, AgentLoopManager from .partial_single_turn_agent_loop import PartialSingleTurnAgentLoop -from .single_turn_agent_loop import SingleTurnAgentLoop - -_ = [SingleTurnAgentLoop, PartialSingleTurnAgentLoop] - -__all__ = ["AgentLoopBase", "AgentLoopManager"] +_ = [PartialSingleTurnAgentLoop] diff --git a/recipe/fully_async_policy/agent_loop/agent_loop.py b/recipe/fully_async_policy/agent_loop/agent_loop.py index 4da6b562c4c..38c461629dc 100644 --- a/recipe/fully_async_policy/agent_loop/agent_loop.py +++ b/recipe/fully_async_policy/agent_loop/agent_loop.py @@ -12,97 +12,34 @@ # See the License for the specific language governing permissions and # limitations under the License. import asyncio -import heapq import logging import os -import random -from abc import ABC, abstractmethod from typing import Any, Optional import hydra import numpy as np import ray import torch -from cachetools import LRUCache from omegaconf import DictConfig, OmegaConf -from pydantic import BaseModel from tensordict import TensorDict -from transformers import AutoTokenizer from verl.protocol import DataProto from verl.single_controller.ray.base import RayWorkerGroup from verl.utils import hf_tokenizer from verl.utils.fs import copy_to_local -from verl.utils.rollout_trace import RolloutTraceConfig, rollout_trace_attr, rollout_trace_op +from verl.utils.rollout_trace import RolloutTraceConfig, rollout_trace_attr +from verl.workers.rollout.replica import TokenOutput + +from verl.experimental.agent_loop.agent_loop import AgentLoopOutput, _agent_loop_registry, _DummyConfig logger = logging.getLogger(__file__) logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) +from verl.experimental.agent_loop.agent_loop import * -class AsyncLLMServerManager: - """ - A class to manage multiple OpenAI compatible LLM servers. 
This class provides - - Load balance: least requests load balancing - - Sticky session: send multi-turn chat completions to same server for automatic prefix caching - """ - - def __init__(self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], max_cache_size: int = 10000): - """Initialize the AsyncLLMServerManager. - - Args: - config (DictConfig): YAML config. - server_handles (List[ray.actor.ActorHandle]): OpenAI compatible LLM server actor handles. - max_cache_size (int, optional): max cache size for request_id to server mapping. Defaults to 10000. - """ - self.config = config - self.server_handles = server_handles - random.shuffle(self.server_handles) - - # Least requests load balancing - self.weighted_serveres = [[0, (hash(server), server)] for server in server_handles] - heapq.heapify(self.weighted_serveres) - - # LRU cache to map request_id to server - self.request_id_to_server = LRUCache(maxsize=max_cache_size) - - def _choose_server(self, request_id: str) -> ray.actor.ActorHandle: - # TODO: implement server pressure awareness load balancing - if request_id in self.request_id_to_server: - return self.request_id_to_server[request_id] - - server = self.weighted_serveres[0][1][1] - self.weighted_serveres[0][0] += 1 - heapq.heapreplace(self.weighted_serveres, self.weighted_serveres[0]) - self.request_id_to_server[request_id] = server - return server - - @rollout_trace_op - async def generate( - self, - request_id, - *, - prompt_ids: list[int], - sampling_params: dict[str, Any], - ) -> list[int]: - """Generate tokens from prompt ids. - - Args: - request_id (str): request id for sticky session. - prompt_ids (List[int]): List of prompt token ids. - sampling_params (Dict[str, Any]): Sampling parameters for the chat completion. - - Returns: - List[int]: List of generated token ids. 
- """ - server = self._choose_server(request_id) - output = await server.generate.remote( - request_id=request_id, - prompt_ids=prompt_ids, - sampling_params=sampling_params, - ) - return output - async def generate_for_partial(self, request_id, prompt_ids, sampling_params): +class PartialAsyncLLMServerManager(AsyncLLMServerManager): + async def generate_for_partial(self, request_id, prompt_ids, sampling_params) -> TokenOutput: """Generate tokens from prompt ids. with partial rollout function""" server = self._choose_server(request_id) output = await server.generate_for_partial.remote( @@ -113,275 +50,25 @@ async def generate_for_partial(self, request_id, prompt_ids, sampling_params): return output -class AgentLoopMetrics(BaseModel): - """Agent loop performance metrics.""" - - generate_sequences: float = 0.0 - tool_calls: float = 0.0 - - -class AgentLoopOutput(BaseModel): +class PartialAgentLoopOutput(AgentLoopOutput): """Agent loop output.""" - prompt_ids: list[int] - """Prompt token ids.""" - response_ids: list[int] - """Response token ids including LLM generated token, tool response token.""" - response_mask: list[int] - """Response mask, 1 for LLM generated token, 0 for tool response token.""" - num_turns: int = 0 - """Number of chat turns, including user, assistant, tool.""" - metrics: AgentLoopMetrics - """Auxiliary performance metrics""" is_cancel: bool = False """Indicates whether the request was interrupted""" log_probs: list[float] = None """Response token log probs including LLM generated token, tool response token.""" -# make hydra.utils.instantiate happy -class _DummyConfig: - def __init__(self, config: DictConfig) -> None: - self.config = config - - -class AgentLoopBase(ABC): - """An agent loop takes a input message, chat with OpenAI compatible LLM server and interact with various - environments.""" - - _class_initialized = False - +@ray.remote +class FullyAgentLoopWorker(AgentLoopWorker): def __init__( - self, trainer_config: _DummyConfig, 
server_manager: AsyncLLMServerManager, tokenizer: AutoTokenizer, **kwargs + self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], rm_executor: BatchExecutor = None ): - """Initialize agent loop, each sample will have its own loop instance. - - Args: - trainer_config (_DummyConfig): trainer config. - server_manager (AsyncLLMServerManager): OpenAI compatible LLM server manager. - tokenizer (AutoTokenizer): Tokenizer for tokenize messages. - """ - self.init_class(trainer_config.config, tokenizer, **kwargs) - self.config = trainer_config.config - self.server_manager = server_manager - self.tokenizer = tokenizer - self.loop = asyncio.get_running_loop() - - @classmethod - def init_class(cls, config: DictConfig, tokenizer: AutoTokenizer, **kwargs): - """This is used to do heavy initialization work that should shared across all instances. It's only called once. - - Args: - config (DictConfig): trainer config. - tokenizer (AutoTokenizer): Tokenizer for tokenize messages. - **kwargs: extra kwargs from config file passed in by `hydra.utils.instantiate`. - """ - if cls._class_initialized: - return - cls._class_initialized = True - - @abstractmethod - async def run(self, messages: list[dict[str, Any]], sampling_params: dict[str, Any]) -> AgentLoopOutput: - """Run agent loop to interact with LLM server and environment. - - Args: - messages (List[Dict[str, Any]]): Input messages. - sampling_params (Dict[str, Any]): LLM sampling params. - - Returns: - AgentLoopOutput: Agent loop output. - """ - raise NotImplementedError - - -"""Agent loop registry: key is agent_name, value is a dict of agent loop config -used by hydra.utils.instantiate to initialize agent loop instance. 
- -https://hydra.cc/docs/advanced/instantiate_objects/overview/ -""" -_agent_loop_registry: dict[str, dict] = {} - - -def register(agent_name: str): - """Register agent loop class.""" - - def decorator(subclass: type[AgentLoopBase]) -> type[AgentLoopBase]: - fqdn = f"{subclass.__module__}.{subclass.__qualname__}" - _agent_loop_registry[agent_name] = {"_target_": fqdn} - return subclass - - return decorator - - -def postprocess_agent_loop_outputs(inputs: list[AgentLoopOutput], tokenizer, config) -> DataProto: - """Static method to postprocess a list of AgentLoopOutput into DataProto - - Args: - inputs: List of AgentLoopOutput - tokenizer: Tokenizer instance - config: Configuration object - - Returns: - DataProto: Processed batch data - """ - # NOTE: consistent with batch version of generate_sequences in vllm_rollout_spmd.py - # prompts: left pad - # responses: right pad - # input_ids: prompt + response - # attention_mask: [0,0,0,0,1,1,1,1, | 1,1,1,0,0,0,0,0] - # position_ids: [0,0,0,0,0,1,2,3, | 4,5,6,7,8,9,10,11] - - # prompts - tokenizer.padding_side = "left" - outputs = tokenizer.pad( - [{"input_ids": input.prompt_ids} for input in inputs], - padding="max_length", - max_length=config.actor_rollout_ref.rollout.prompt_length, - return_tensors="pt", - return_attention_mask=True, - ) - prompt_ids, prompt_attention_mask = outputs["input_ids"], outputs["attention_mask"] - - # responses - tokenizer.padding_side = "right" - outputs = tokenizer.pad( - [{"input_ids": input.response_ids} for input in inputs], - padding="max_length", - max_length=config.actor_rollout_ref.rollout.response_length, - return_tensors="pt", - return_attention_mask=True, - ) - response_ids, response_attention_mask = outputs["input_ids"], outputs["attention_mask"] - - # response_mask - outputs = tokenizer.pad( - [{"input_ids": input.response_mask} for input in inputs], - padding="max_length", - max_length=config.actor_rollout_ref.rollout.response_length, - return_tensors="pt", - 
return_attention_mask=False, - ) - response_mask = outputs["input_ids"] - assert response_ids.shape == response_mask.shape, ( - f"mismatch in response_ids and response_mask shape: {response_ids.shape} vs {response_mask.shape}" - ) - response_mask = response_mask * response_attention_mask - - input_ids = torch.cat([prompt_ids, response_ids], dim=1) - attention_mask = torch.cat([prompt_attention_mask, response_attention_mask], dim=1) - position_ids = (attention_mask.cumsum(dim=1) - 1) * attention_mask - - batch = TensorDict( - { - "prompts": prompt_ids, # [bsz, prompt_length] - "responses": response_ids, # [bsz, response_length] - "response_mask": response_mask, # [bsz, response_length] - "input_ids": input_ids, # [bsz, prompt_length + response_length] - "attention_mask": attention_mask, # [bsz, prompt_length + response_length] - "position_ids": position_ids, # [bsz, prompt_length + response_length] - }, - batch_size=len(input_ids), - ) - - num_turns = np.array([input.num_turns for input in inputs], dtype=np.int32) - metrics = [input.metrics.model_dump() for input in inputs] - return DataProto(batch=batch, non_tensor_batch={"__num_turns__": num_turns}, meta_info={"metrics": metrics}) - - -@ray.remote -class AgentLoopWorker: - """Agent loop worker takes a batch of messages and run each message in an agent loop.""" - - def __init__(self, config: DictConfig, server_handles: list[ray.actor.ActorHandle]): - """Initialize agent loop manager. - - Args: - config (DictConfig): YAML config. - server_handles (List[ray.actor.ActorHandle]): OpenAI compatible LLM server actor handles. 
- """ - self.config = config - self.server_manager = AsyncLLMServerManager(config, server_handles) - - model_path = config.actor_rollout_ref.model.path - self.model_name = "/".join(model_path.split("/")[-2:]) - local_path = copy_to_local(config.actor_rollout_ref.model.path) - self.tokenizer = hf_tokenizer(local_path, trust_remote_code=True) - - agent_loop_config_path = config.actor_rollout_ref.rollout.agent.agent_loop_config_path - if agent_loop_config_path: - agent_loop_configs = OmegaConf.load(agent_loop_config_path) - for agent_loop_config in agent_loop_configs: - _agent_loop_registry[agent_loop_config.name] = agent_loop_config - - trace_config = config.trainer.get("rollout_trace", {}) - trace_config = self.config.actor_rollout_ref.rollout.get("trace", {}) - RolloutTraceConfig.init( - self.config.trainer.project_name, - self.config.trainer.experiment_name, - trace_config.get("backend"), - trace_config.get("token2text", False), - ) - - async def generate_sequences(self, batch: DataProto) -> DataProto: - """Generate sequences from agent loop. - - Args: - batch (DataProto): Input batch. - - Returns: - DataProto: Output batch. - - prompts: [bsz, prompt_length], prompt token ids from dataset. - - responses: [bsz, response_length], output token ids include response tokens - from LLM generation and observation tokens from tool_calls. - - response_mask: [bsz, response_length], 1 for LLM generated tokens, 0 for observation/padding tokens. - - input_ids: [bsz, prompt_length + response_length], whole sequence token ids, including prompt tokens - and response tokens. - - attention_mask: [bsz, prompt_length + response_length], 0 for padding tokens, 1 for other tokens. - - position_ids: [bsz, prompt_length + response_length], incremental position ids. 
- - For multi-turn conversations: - responses: |<- LLM generation ->|<- tool_calls ->|<- LLM generation ->|<- padding ->| - response_mask: | 1, 1, 1, ..., 1, 1 | 0, 0, .., 0, 0 | 1, 1, 1, ..., 1, 1 | 0, 0, ..., 0| - """ - config = self.config.actor_rollout_ref.rollout - sampling_params = dict( - temperature=config.temperature, - top_p=config.top_p, - repetition_penalty=1.0, - ) - - # override sampling params for validation - if batch.meta_info.get("validate", False): - sampling_params["top_p"] = config.val_kwargs.top_p - sampling_params["temperature"] = config.val_kwargs.temperature - - # by default, we assume it's a single turn agent - if "agent_name" not in batch.non_tensor_batch: - batch.non_tensor_batch["agent_name"] = np.array(["single_turn_agent"] * len(batch), dtype=object) - - tasks = [] - agent_names = batch.non_tensor_batch["agent_name"] - raw_prompts = batch.non_tensor_batch["raw_prompt"] - if "index" in batch.non_tensor_batch: - index = batch.non_tensor_batch["index"] - else: - index = np.arange(len(raw_prompts)) - - trajectory_info = await get_trajectory_info( - batch.meta_info.get("global_steps", -1), index, batch.meta_info.get("validate", False) - ) - - for agent_name, messages, trajectory in zip(agent_names, raw_prompts, trajectory_info, strict=True): - tasks.append( - asyncio.create_task(self._run_agent_loop(agent_name, messages.tolist(), sampling_params, trajectory)) - ) - outputs = await asyncio.gather(*tasks) - - output = postprocess_agent_loop_outputs(outputs, self.tokenizer, self.config) - return output + self.AsyncLLMServerManager = PartialAsyncLLMServerManager + super().__init__(config, server_handles, rm_executor) async def generate_sequences_no_post( - self, batch: DataProto, partial_output_list: Optional[list[AgentLoopOutput]] + self, batch: DataProto, partial_output_list: Optional[list[AgentLoopOutput]] ) -> list[AgentLoopOutput]: """Generate sequences from agent loop. 
@@ -403,6 +90,7 @@ async def generate_sequences_no_post( temperature=config.temperature, top_p=config.top_p, repetition_penalty=1.0, + logprobs=config.calculate_log_probs, ) # override sampling params for validation @@ -414,9 +102,6 @@ async def generate_sequences_no_post( if "agent_name" not in batch.non_tensor_batch: batch.non_tensor_batch["agent_name"] = np.array(["single_turn_agent"] * len(batch), dtype=object) - tasks = [] - agent_names = batch.non_tensor_batch["agent_name"] - raw_prompts = batch.non_tensor_batch["raw_prompt"] if "index" in batch.non_tensor_batch: index = batch.non_tensor_batch["index"] else: @@ -425,184 +110,61 @@ async def generate_sequences_no_post( trajectory_info = await get_trajectory_info( batch.meta_info.get("global_steps", -1), index, batch.meta_info.get("validate", False) ) + if not partial_output_list: partial_output_list = [None] * len(batch) - for agent_name, messages, trajectory, partial_output in zip( - agent_names, raw_prompts, trajectory_info, partial_output_list, strict=True - ): + tasks = [] + for i in range(len(batch)): + kwargs = {k: v[i] for k, v in batch.non_tensor_batch.items()} tasks.append( asyncio.create_task( - self._run_agent_loop(agent_name, messages.tolist(), sampling_params, trajectory, partial_output) - ) - ) - outputs = await asyncio.gather(*tasks) - - return outputs - - async def _run_agent_loop( - self, - agent_name: str, - messages: list[dict[str, Any]], - sampling_params: dict[str, Any], - trajectory: dict[str, Any], - partial_output: Optional[AgentLoopOutput] = None, + self._partial_run_agent_loop(sampling_params, + trajectory_info[i], + partial_output_list[i], + **kwargs))) + return await asyncio.gather(*tasks) + + async def _partial_run_agent_loop( + self, + sampling_params: dict[str, Any], + trajectory: dict[str, Any], + partial_output: Optional[AgentLoopOutput] = None, + *, + agent_name: str, + **kwargs, ) -> AgentLoopOutput: with rollout_trace_attr( - step=trajectory["step"], - 
sample_index=trajectory["sample_index"], - rollout_n=trajectory["rollout_n"], - validate=trajectory["validate"], - name="agent_loop", + step=trajectory["step"], + sample_index=trajectory["sample_index"], + rollout_n=trajectory["rollout_n"], + validate=trajectory["validate"], + name="agent_loop", ): assert agent_name in _agent_loop_registry, ( f"Agent loop {agent_name} not registered, registered agent loops: {_agent_loop_registry.keys()}" ) + agent_loop_config = _agent_loop_registry[agent_name] agent_loop = hydra.utils.instantiate( config=agent_loop_config, trainer_config=_DummyConfig(config=self.config), server_manager=self.server_manager, tokenizer=self.tokenizer, + processor=self.processor, ) - if agent_name == "partial_single_turn_agent": - output = await agent_loop.run(messages, sampling_params, partial_output) - else: - output = await agent_loop.run(messages, sampling_params) - return output - - -async def get_trajectory_info(step, index, validate): - """Get trajectory info. - - Args: - step (int): global steps in the trainer. - index (list): form datastore extra_info.index column. - validate (bool): whether is a validate step. - - Returns: - list: trajectory. - """ - trajectory_info = [] - rollout_n = 0 - for i in range(len(index)): - if i > 0 and index[i - 1] == index[i]: - rollout_n += 1 - else: - rollout_n = 0 - trajectory_info.append({"step": step, "sample_index": index[i], "rollout_n": rollout_n, "validate": validate}) - return trajectory_info + return await agent_loop.run(sampling_params, partial_output, **kwargs) -class AgentLoopManager: - """Agent loop manager that manages a group of agent loop workers.""" - - def __init__(self, config: DictConfig, worker_group: RayWorkerGroup): - """Initialize agent loop manager. - - Args: - config (DictConfig): trainer config. - worker_group (RayWorkerGroup): AsyncActorRolloutRefWorker worker group. 
- """ - self.config = config - self.worker_group = worker_group - - self._initialize_llm_servers() - self._init_agent_loop_workers() - - # Initially we're in sleep mode. - self.sleep() - - def _initialize_llm_servers(self): - self.rollout_tp_size = self.config.actor_rollout_ref.rollout.tensor_model_parallel_size - self.rollout_dp_size = self.worker_group.world_size // self.rollout_tp_size - - register_center = ray.get_actor(f"{self.worker_group.name_prefix}_register_center") - workers_info = ray.get(register_center.get_worker_info.remote()) - assert len(workers_info) == self.worker_group.world_size - - self.async_llm_servers = [None] * self.rollout_dp_size - self.server_addresses = [None] * self.rollout_dp_size - - if self.config.actor_rollout_ref.rollout.agent.custom_async_server: - server_class = async_server_class( - rollout_backend=self.config.actor_rollout_ref.rollout.name, - rollout_backend_module=self.config.actor_rollout_ref.rollout.agent.custom_async_server.path, - rollout_backend_class=self.config.actor_rollout_ref.rollout.agent.custom_async_server.name, - ) - else: - server_class = async_server_class(rollout_backend=self.config.actor_rollout_ref.rollout.name) - - # Start all server instances, restart if address already in use. 
- unready_dp_ranks = set(range(self.rollout_dp_size)) - while len(unready_dp_ranks) > 0: - servers = { - rollout_dp_rank: server_class.options( - # make sure AsyncvLLMServer colocates with its corresponding workers - scheduling_strategy=ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy( - node_id=workers_info[rollout_dp_rank * self.rollout_tp_size], - soft=False, - ), - name=f"async_llm_server_{rollout_dp_rank}", - ).remote(self.config, self.rollout_dp_size, rollout_dp_rank, self.worker_group.name_prefix) - for rollout_dp_rank in unready_dp_ranks - } - - for rollout_dp_rank, server in servers.items(): - try: - address = ray.get(server.get_server_address.remote()) - self.server_addresses[rollout_dp_rank] = address - self.async_llm_servers[rollout_dp_rank] = server - unready_dp_ranks.remove(rollout_dp_rank) - except Exception: - ray.kill(server) - print(f"rollout server {rollout_dp_rank} failed, maybe address already in use, restarting...") - - # All server instances are ready, init AsyncLLM engine. - ray.get([server.init_engine.remote() for server in self.async_llm_servers]) - - def _init_agent_loop_workers(self): - self.agent_loop_workers = [] - for i in range(self.config.actor_rollout_ref.rollout.agent.num_workers): - self.agent_loop_workers.append( - AgentLoopWorker.options( - name=f"agent_loop_worker_{i}", - ).remote(self.config, self.async_llm_servers) - ) - - def generate_sequences(self, prompts: DataProto) -> DataProto: - """Split input batch and dispatch to agent loop workers. - - Args: - prompts (DataProto): Input batch. - - Returns: - DataProto: Output batch. 
- """ - if self.config.actor_rollout_ref.rollout.free_cache_engine: - self.wake_up() - chunkes = prompts.chunk(len(self.agent_loop_workers)) - outputs = ray.get( - [ - worker.generate_sequences.remote(chunk) - for worker, chunk in zip(self.agent_loop_workers, chunkes, strict=True) - ] - ) - output = DataProto.concat(outputs) - if self.config.actor_rollout_ref.rollout.free_cache_engine: - self.sleep() - - # calculate performance metrics - metrics = [output.meta_info["metrics"] for output in outputs] # List[List[Dict[str, str]]] - timing = self._performance_metrics(metrics, output) - - output.meta_info = {"timing": timing} - return output +class FullyAgentLoopManager(AgentLoopManager): + def __init__(self, config: DictConfig, worker_group: RayWorkerGroup = None, rm_wg: RayWorkerGroup = None): + super().__init__(config, worker_group, rm_wg) + self.AgentLoopWorker = FullyAgentLoopWorker async def generate_single_sample_async( - self, - sample: DataProto, - partial_output_list: Optional[list[AgentLoopOutput]], + self, + sample: DataProto, + partial_output_list: Optional[list[AgentLoopOutput]], ) -> list[AgentLoopOutput]: """ 异步处理单个样本, 需要复制n次 @@ -629,36 +191,6 @@ def _select_best_worker(self): self._worker_index = (self._worker_index + 1) % len(self.agent_loop_workers) return worker - def _performance_metrics(self, metrics: list[list[dict[str, str]]], output: DataProto) -> dict[str, float]: - timing = {} - t_generate_sequences = np.array([metric["generate_sequences"] for chunk in metrics for metric in chunk]) - t_tool_calls = np.array([metric["tool_calls"] for chunk in metrics for metric in chunk]) - timing["agent_loop/generate_sequences/min"] = t_generate_sequences.min() - timing["agent_loop/generate_sequences/max"] = t_generate_sequences.max() - timing["agent_loop/generate_sequences/mean"] = t_generate_sequences.mean() - timing["agent_loop/tool_calls/min"] = t_tool_calls.min() - timing["agent_loop/tool_calls/max"] = t_tool_calls.max() - 
timing["agent_loop/tool_calls/mean"] = t_tool_calls.mean() - - # batch sequence generation is bounded by the slowest sample - slowest = np.argmax(t_generate_sequences + t_tool_calls) - attention_mask = output.batch["attention_mask"][slowest] - prompt_length = output.batch["prompts"].shape[1] - timing["agent_loop/slowest/generate_sequences"] = t_generate_sequences[slowest] - timing["agent_loop/slowest/tool_calls"] = t_tool_calls[slowest] - timing["agent_loop/slowest/prompt_length"] = attention_mask[:prompt_length].sum().item() - timing["agent_loop/slowest/response_length"] = attention_mask[prompt_length:].sum().item() - - return timing - - def wake_up(self): - """Wake up all rollout server instances.""" - ray.get([server.wake_up.remote() for server in self.async_llm_servers]) - - def sleep(self): - """Sleep all rollout server instances.""" - ray.get([server.sleep.remote() for server in self.async_llm_servers]) - async def cancel_async(self): """Cancel all rollout tasks asynchronously.""" futures = [server.cancel.remote() for server in self.async_llm_servers] @@ -668,38 +200,3 @@ async def resume_async(self): """Cancel all rollout tasks asynchronously.""" futures = [server.resume.remote() for server in self.async_llm_servers] await asyncio.gather(*[asyncio.wrap_future(future.future()) for future in futures], return_exceptions=True) - - -from verl.workers.rollout.async_server import AsyncServerBase - - -def async_server_class( - rollout_backend: str, rollout_backend_module: Optional[str] = None, rollout_backend_class: Optional[str] = None -) -> type[AsyncServerBase]: - """Get async server class. - - Args: - rollout_backend: str, rollout backend type (alias), should be "vllm". - rollout_backend_module: Optional[str], import path of the rollout backend. - rollout_backend_class: Optional[str], class name of the rollout backend. - - Returns: - Type[AsyncServerBase]: async server class. 
- """ - if rollout_backend_class is None and rollout_backend_module is None: - # If both are None, use the default backend class - # Do not change the original import behavior - # importlib.import_module and from ... import ... have subtle differences in ray - - if rollout_backend == "vllm": - from recipe.fully_async_policy.vllm_rollout.vllm_async_server import AsyncvLLMServer - return AsyncvLLMServer - else: - raise NotImplementedError(f"rollout backend {rollout_backend} is not supported") - - if rollout_backend_module is None or rollout_backend_class is None: - raise ValueError("rollout_backend_module and rollout_backend_class must be both provided for customization") - - from verl.utils.import_utils import load_extern_type - - return load_extern_type(rollout_backend_module, rollout_backend_class) diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py index 75d67ec1ab1..69041d923b5 100644 --- a/recipe/fully_async_policy/detach_utils.py +++ b/recipe/fully_async_policy/detach_utils.py @@ -18,15 +18,86 @@ import numpy as np import torch +from tensordict import TensorDict from verl import DataProto -from recipe.fully_async_policy.agent_loop.agent_loop import postprocess_agent_loop_outputs +from verl.experimental.agent_loop.agent_loop import AgentLoopOutput from verl.trainer.ppo.ray_trainer import compute_response_mask -# Calculate the number of samples needed -def calculate_one_step_size(minimal_bsz, ppo_mini_batch_size): - return minimal_bsz * ppo_mini_batch_size +def postprocess_agent_loop_outputs(inputs: list[AgentLoopOutput], tokenizer, config) -> DataProto: + """Static method to postprocess a list of AgentLoopOutput into DataProto + + Args: + inputs: List of AgentLoopOutput + tokenizer: Tokenizer instance + config: Configuration object + + Returns: + DataProto: Processed batch data + """ + # NOTE: consistent with batch version of generate_sequences in vllm_rollout_spmd.py + # prompts: left pad + # responses: right pad 
+ # input_ids: prompt + response + # attention_mask: [0,0,0,0,1,1,1,1, | 1,1,1,0,0,0,0,0] + # position_ids: [0,0,0,0,0,1,2,3, | 4,5,6,7,8,9,10,11] + + # prompts + tokenizer.padding_side = "left" + outputs = tokenizer.pad( + [{"input_ids": input.prompt_ids} for input in inputs], + padding="max_length", + max_length=config.actor_rollout_ref.rollout.prompt_length, + return_tensors="pt", + return_attention_mask=True, + ) + prompt_ids, prompt_attention_mask = outputs["input_ids"], outputs["attention_mask"] + + # responses + tokenizer.padding_side = "right" + outputs = tokenizer.pad( + [{"input_ids": input.response_ids} for input in inputs], + padding="max_length", + max_length=config.actor_rollout_ref.rollout.response_length, + return_tensors="pt", + return_attention_mask=True, + ) + response_ids, response_attention_mask = outputs["input_ids"], outputs["attention_mask"] + + # response_mask + outputs = tokenizer.pad( + [{"input_ids": input.response_mask} for input in inputs], + padding="max_length", + max_length=config.actor_rollout_ref.rollout.response_length, + return_tensors="pt", + return_attention_mask=False, + ) + response_mask = outputs["input_ids"] + assert response_ids.shape == response_mask.shape, ( + f"mismatch in response_ids and response_mask shape: {response_ids.shape} vs {response_mask.shape}" + ) + response_mask = response_mask * response_attention_mask + + input_ids = torch.cat([prompt_ids, response_ids], dim=1) + attention_mask = torch.cat([prompt_attention_mask, response_attention_mask], dim=1) + position_ids = (attention_mask.cumsum(dim=1) - 1) * attention_mask + + batch = TensorDict( + { + "prompts": prompt_ids, # [bsz, prompt_length] + "responses": response_ids, # [bsz, response_length] + "response_mask": response_mask, # [bsz, response_length] + "input_ids": input_ids, # [bsz, prompt_length + response_length] + "attention_mask": attention_mask, # [bsz, prompt_length + response_length] + "position_ids": position_ids, # [bsz, prompt_length + 
response_length] + }, + batch_size=len(input_ids), + ) + + num_turns = np.array([input.num_turns for input in inputs], dtype=np.int32) + metrics = [input.metrics.model_dump() for input in inputs] + return DataProto(batch=batch, non_tensor_batch={"__num_turns__": num_turns}, meta_info={"metrics": metrics}) @dataclass @@ -157,7 +228,7 @@ def merge_rollout_sample(config, tokenizer, rs: RolloutSample): def assemble_batch_from_rollout_samples( - rollout_samples: list[RolloutSample], tokenizer, config, balance_batch=None + rollout_samples: list[RolloutSample], tokenizer, config, balance_batch=None ) -> DataProto: """ Assemble gen_batch_output from RolloutSample objects @@ -368,7 +439,7 @@ def _special_metrics_aggergate(self, aggregated: dict[str, Any]) -> dict[str, An REQUIRED_PERF_KEYS = {"perf/throughput", "perf/total_num_tokens", "perf/time_per_step"} if REQUIRED_PERF_KEYS.issubset(aggregated): aggregated["perf/throughput"] = aggregated["perf/total_num_tokens"] / ( - aggregated["perf/time_per_step"] * self.total_gpus + aggregated["perf/time_per_step"] * self.total_gpus ) return aggregated diff --git a/recipe/fully_async_policy/fsdp_workers.py b/recipe/fully_async_policy/fsdp_workers.py index 7a1b59aa64c..82c23bfa04f 100644 --- a/recipe/fully_async_policy/fsdp_workers.py +++ b/recipe/fully_async_policy/fsdp_workers.py @@ -38,8 +38,9 @@ ) from verl.utils.import_utils import import_external_libs from verl.utils.model import get_generation_config, update_model_config -from verl.utils.vllm_utils import patch_vllm_moe_model_weight_loader +from verl.workers.config import HFModelConfig, RolloutConfig from verl.workers.fsdp_workers import ActorRolloutRefWorker, AsyncActorRolloutRefWorker, CriticWorker +from verl.workers.rollout import get_rollout_class logger = logging.getLogger(__file__) logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) @@ -82,6 +83,9 @@ def sync_rollout_weights(self): params = self._get_actor_params() if self._is_actor else None if self._is_rollout: 
inference_model = get_inference_model(self.rollout) + + from verl.utils.vllm.patch import patch_vllm_moe_model_weight_loader + patch_vllm_moe_model_weight_loader(inference_model) for key, shape, dtype in self._weights_info: tensor = torch.empty(shape, dtype=dtype, device=get_torch_device().current_device()) @@ -207,32 +211,28 @@ def init_model(self): rollout_device_mesh = init_device_mesh( device_name, mesh_shape=(dp, infer_tp), mesh_dim_names=["dp", "infer_tp"] ) + + is_collect = rollout_device_mesh["infer_tp"].get_local_rank() == 0 + self._register_dispatch_collect_info( + "rollout", dp_rank=rollout_device_mesh["dp"].get_local_rank(), is_collect=is_collect + ) + rollout_name = self.config.rollout.name assert rollout_name == "vllm" - from verl.workers.rollout.vllm_rollout import vLLMRollout + rollout_config: RolloutConfig = omega_conf_to_dataclass(self.config.rollout) + model_config: HFModelConfig = omega_conf_to_dataclass(self.config.model, dataclass_type=HFModelConfig) log_gpu_memory_usage(f"Before building {rollout_name} rollout", logger=logger) - - from verl.workers.rollout.vllm_rollout import vLLMAsyncRollout - - vllm_rollout_cls = vLLMRollout if self.config.rollout.mode == "sync" else vLLMAsyncRollout - rollout = vllm_rollout_cls( - model_path=local_path, - config=self.config.rollout, - tokenizer=self.tokenizer, - model_hf_config=actor_model_config, - device_mesh=rollout_device_mesh, - trust_remote_code=trust_remote_code, + rollout = get_rollout_class(rollout_config.name, rollout_config.mode)( + config=rollout_config, model_config=model_config, device_mesh=rollout_device_mesh ) log_gpu_memory_usage(f"After building {rollout_name} rollout", logger=logger) - from .detach_sharding_manager import DetachShardingManager sharding_manager = DetachShardingManager( inference_engine=rollout.inference_engine, device_mesh=rollout_device_mesh ) - log_gpu_memory_usage("After building sharding manager", logger=logger) self.rollout = rollout diff --git 
a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py index c0f156296a2..b98b3f426e0 100644 --- a/recipe/fully_async_policy/fully_async_main.py +++ b/recipe/fully_async_policy/fully_async_main.py @@ -24,8 +24,9 @@ from recipe.fully_async_policy.fully_async_rollouter import FullyAsyncRollouter from recipe.fully_async_policy.fully_async_trainer import FullyAsyncTrainer from recipe.fully_async_policy.message_queue import MessageQueue, MessageQueueClient -from recipe.fully_async_policy.ray_trainer import ResourcePoolManager, Role +from verl.trainer.ppo.ray_trainer import ResourcePoolManager from verl.trainer.ppo.reward import load_reward_manager +from verl.trainer.ppo.utils import Role from verl.utils.fs import copy_to_local @@ -270,7 +271,7 @@ def _run_training_loop(self): for future in done_futures: try: ray.get(future) - print(f"[ASYNC MAIN] One component completed successfully") + print("[ASYNC MAIN] One component completed successfully") except Exception as e: print(f"[ASYNC MAIN] Component failed with error: {e}") for remaining_future in remaining_futures: @@ -291,7 +292,7 @@ def _run_training_loop(self): @hydra.main(config_path="config", config_name="fully_async_ppo_trainer", version_base=None) def main(config): - from recipe.fully_async_policy.main_ppo import run_ppo + from verl.trainer.main_ppo import run_ppo # Ensure async training config exists if not hasattr(config, "async_training"): diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 919314ba1b5..e53e6c43ef5 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -24,14 +24,16 @@ prepare_single_generation_data, ) from recipe.fully_async_policy.message_queue import MessageQueueClient +from recipe.fully_async_policy.ray_trainer import FullyAsyncRayPPOTrainer from verl.single_controller.ray import RayClassWithInitArgs, 
RayWorkerGroup -from recipe.fully_async_policy.ray_trainer import RayPPOTrainer, ResourcePoolManager, Role, WorkerType +from verl.trainer.ppo.ray_trainer import ResourcePoolManager +from verl.trainer.ppo.utils import Role, WorkerType from verl.utils.profiler import marked_timer from verl.utils.tracking import ValidationGenerationsLogger @ray.remote(num_cpus=10, max_concurrency=100) -class FullyAsyncRollouter(RayPPOTrainer): +class FullyAsyncRollouter(FullyAsyncRayPPOTrainer): """ Asynchronous sample generator, responsible for continuously generating training samples and putting them into MessageQueue @@ -227,7 +229,6 @@ def _validate_config(self): if not hasattr(self.config, "async_training"): raise ValueError("[FullyAsyncRollouter] Missing async_training configuration") assert self.config.actor_rollout_ref.rollout.calculate_log_probs, "must rollout calculate log_probs" - super()._validate_config() def _create_actor_rollout_classes(self): # only create rollout @@ -257,10 +258,10 @@ def _create_continuous_iterator(self): def _init_async_rollout_manager(self): # create async rollout manager and request scheduler assert self.config.actor_rollout_ref.rollout.mode == "async" - from recipe.fully_async_policy.agent_loop import AgentLoopManager + from recipe.fully_async_policy.agent_loop import FullyAgentLoopManager self.async_rollout_mode = True - self.async_rollout_manager = AgentLoopManager( + self.async_rollout_manager = FullyAgentLoopManager( config=self.config, worker_group=self.rollout_wg, ) diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 0c1501cbf89..4cba527c857 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -28,20 +28,17 @@ assemble_batch_from_rollout_samples, ) from recipe.fully_async_policy.message_queue import MessageQueueClient +from recipe.fully_async_policy.ray_trainer import FullyAsyncRayPPOTrainer from 
verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup from verl.trainer.ppo import core_algos from verl.trainer.ppo.core_algos import AdvantageEstimator -from recipe.fully_async_policy.ray_trainer import ( - RayPPOTrainer, - ResourcePoolManager, - Role, - WorkerType, -) +from verl.trainer.ppo.ray_trainer import ResourcePoolManager +from verl.trainer.ppo.utils import Role, WorkerType from verl.utils.debug import marked_timer @ray.remote(num_cpus=10) -class FullyAsyncTrainer(RayPPOTrainer): +class FullyAsyncTrainer(FullyAsyncRayPPOTrainer): """ A fully asynchronous PPO trainer that obtains samples from a MessageQueue for training. Based on an improved implementation of OneStepOffRayTrainer diff --git a/recipe/fully_async_policy/main_ppo.py b/recipe/fully_async_policy/main_ppo.py deleted file mode 100644 index 4b240c6ffbf..00000000000 --- a/recipe/fully_async_policy/main_ppo.py +++ /dev/null @@ -1,344 +0,0 @@ -# Copyright 2024 Bytedance Ltd. and/or its affiliates -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Note that we don't combine the main with ray_trainer as ray_trainer is used by other main. 
-""" - -import os -import socket - -import hydra -import ray -from omegaconf import OmegaConf - -from verl.experimental.dataset.sampler import AbstractSampler -from verl.trainer.constants_ppo import get_ppo_ray_runtime_env -from verl.trainer.ppo.ray_trainer import RayPPOTrainer -from verl.trainer.ppo.reward import load_reward_manager -from verl.utils.device import is_cuda_available -from verl.utils.import_utils import load_extern_type - - -@hydra.main(config_path="config", config_name="ppo_trainer", version_base=None) -def main(config): - """Main entry point for PPO training with Hydra configuration management. - - Args: - config_dict: Hydra configuration dictionary containing training parameters. - """ - from time import time - - start_time = time() - run_ppo(config) - print(f"total time: {time() - start_time:.2f} seconds") - - -# Define a function to run the PPO-like training process -def run_ppo(config, task_runner_class=None) -> None: - """Initialize Ray cluster and run distributed PPO training process. - - Args: - config: Training configuration object containing all necessary parameters - for distributed PPO training including Ray initialization settings, - model paths, and training hyperparameters. 
- """ - # Check if Ray is not initialized - if not ray.is_initialized(): - # Initialize Ray with a local cluster configuration - # Set environment variables in the runtime environment to control tokenizer parallelism, - # NCCL debug level, VLLM logging level, and allow runtime LoRA updating - # `num_cpus` specifies the number of CPU cores Ray can use, obtained from the configuration - ray.init( - runtime_env=get_ppo_ray_runtime_env(), - num_cpus=config.ray_init.num_cpus, - ) - # for recipe to change TaskRunner - if task_runner_class is None: - task_runner_class = TaskRunner - - # Create a remote instance of the TaskRunner class, and - # Execute the `run` method of the TaskRunner instance remotely and wait for it to complete - if ( - is_cuda_available - and config.trainer.get("profile_steps") is not None - and len(config.trainer.get("profile_steps", [])) > 0 - ): - nsight_options = OmegaConf.to_container(config.trainer.controller_nsight_options) - runner = task_runner_class.options(runtime_env={"nsight": nsight_options}).remote() - else: - runner = task_runner_class.remote() - ray.get(runner.run.remote(config)) - - # [Optional] get the path of the timeline trace file from the configuration, default to None - # This file is used for performance analysis - timeline_json_file = config.ray_init.get("timeline_json_file", None) - if timeline_json_file: - ray.timeline(filename=timeline_json_file) - - -@ray.remote(num_cpus=1) # please make sure main_task is not scheduled on head -class TaskRunner: - """Ray remote class for executing distributed PPO training tasks. - - This class encapsulates the main training logic and runs as a Ray remote actor - to enable distributed execution across multiple nodes and GPUs. - """ - - def run(self, config): - """Execute the main PPO training workflow. - - This method sets up the distributed training environment, initializes - workers, datasets, and reward functions, then starts the training process. 
- - Args: - config: Training configuration object containing all parameters needed - for setting up and running the PPO training process. - """ - # Print the initial configuration. `resolve=True` will evaluate symbolic values. - from pprint import pprint - - from omegaconf import OmegaConf - - from verl.utils.fs import copy_to_local - - print(f"TaskRunner hostname: {socket.gethostname()}, PID: {os.getpid()}") - pprint(OmegaConf.to_container(config, resolve=True)) - OmegaConf.resolve(config) - - # Download the checkpoint from HDFS to the local machine. - # `use_shm` determines whether to use shared memory, which could lead to faster model loading if turned on - local_path = copy_to_local( - config.actor_rollout_ref.model.path, use_shm=config.actor_rollout_ref.model.get("use_shm", False) - ) - - # Instantiate the tokenizer and processor. - from verl.utils import hf_processor, hf_tokenizer - - trust_remote_code = config.data.get("trust_remote_code", False) - tokenizer = hf_tokenizer(local_path, trust_remote_code=trust_remote_code) - # Used for multimodal LLM, could be None - processor = hf_processor(local_path, trust_remote_code=trust_remote_code, use_fast=True) - - # Define worker classes based on the actor strategy. - if config.actor_rollout_ref.actor.strategy in {"fsdp", "fsdp2"}: - assert config.critic.strategy in {"fsdp", "fsdp2"} - from verl.single_controller.ray import RayWorkerGroup - from verl.workers.fsdp_workers import ActorRolloutRefWorker, AsyncActorRolloutRefWorker - - use_legacy_worker_impl = config.trainer.get("use_legacy_worker_impl", "auto") - if use_legacy_worker_impl in ["auto", "enable"]: - # import warnings - # warnings.warn(f"Legacy worker impl is going to be deprecated, will be removed in the future. 
\ - # Please set trainer.use_legacy_worker_impl = false to switch to the new worker implementation.") - from verl.workers.fsdp_workers import CriticWorker - elif use_legacy_worker_impl == "disable": - from verl.workers.roles import CriticWorker - - print("Using new worker implementation") - else: - raise ValueError(f"Invalid use_legacy_worker_impl: {use_legacy_worker_impl}") - - actor_rollout_cls = ( - AsyncActorRolloutRefWorker - if config.actor_rollout_ref.rollout.mode == "async" - else ActorRolloutRefWorker - ) - ray_worker_group_cls = RayWorkerGroup - - elif config.actor_rollout_ref.actor.strategy == "megatron": - assert config.actor_rollout_ref.actor.strategy == config.critic.strategy - from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup - from verl.workers.megatron_workers import ActorRolloutRefWorker, AsyncActorRolloutRefWorker, CriticWorker - - actor_rollout_cls = ( - AsyncActorRolloutRefWorker - if config.actor_rollout_ref.rollout.mode == "async" - else ActorRolloutRefWorker - ) - ray_worker_group_cls = NVMegatronRayWorkerGroup - - else: - raise NotImplementedError - - from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role - - # Map roles to their corresponding remote worker classes. - role_worker_mapping = { - Role.ActorRollout: ray.remote(actor_rollout_cls), - Role.Critic: ray.remote(CriticWorker), - } - - # Define the resource pool specification. - # Map roles to the resource pool. 
- global_pool_id = "global_pool" - resource_pool_spec = { - global_pool_id: [config.trainer.n_gpus_per_node] * config.trainer.nnodes, - } - mapping = { - Role.ActorRollout: global_pool_id, - Role.Critic: global_pool_id, - } - - # We should adopt a multi-source reward function here: - # - for rule-based rm, we directly call a reward score - # - for model-based rm, we call a model - # - for code related prompt, we send to a sandbox if there are test cases - # finally, we combine all the rewards together - # The reward type depends on the tag of the data - if config.reward_model.enable: - if config.reward_model.strategy in {"fsdp", "fsdp2"}: - from verl.workers.fsdp_workers import RewardModelWorker - elif config.reward_model.strategy == "megatron": - from verl.workers.megatron_workers import RewardModelWorker - else: - raise NotImplementedError - role_worker_mapping[Role.RewardModel] = ray.remote(RewardModelWorker) - mapping[Role.RewardModel] = global_pool_id - - # Add a reference policy worker if KL loss or KL reward is used. - if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss: - role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker) - mapping[Role.RefPolicy] = global_pool_id - - # Load the reward manager for training and validation. - reward_fn = load_reward_manager( - config, tokenizer, num_examine=0, **config.reward_model.get("reward_kwargs", {}) - ) - val_reward_fn = load_reward_manager( - config, tokenizer, num_examine=1, **config.reward_model.get("reward_kwargs", {}) - ) - resource_pool_manager = ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=mapping) - - from verl.utils.dataset.rl_dataset import collate_fn - - # Create training and validation datasets. 
- train_dataset = create_rl_dataset(config.data.train_files, config.data, tokenizer, processor, is_train=True) - val_dataset = create_rl_dataset(config.data.val_files, config.data, tokenizer, processor, is_train=False) - train_sampler = create_rl_sampler(config.data, train_dataset) - - # Initialize the PPO trainer. - trainer = RayPPOTrainer( - config=config, - tokenizer=tokenizer, - processor=processor, - role_worker_mapping=role_worker_mapping, - resource_pool_manager=resource_pool_manager, - ray_worker_group_cls=ray_worker_group_cls, - reward_fn=reward_fn, - val_reward_fn=val_reward_fn, - train_dataset=train_dataset, - val_dataset=val_dataset, - collate_fn=collate_fn, - train_sampler=train_sampler, - ) - # Initialize the workers of the trainer. - trainer.init_workers() - # Start the training process. - trainer.fit() - - -def create_rl_dataset(data_paths, data_config, tokenizer, processor, is_train=True): - """Create a dataset. - - Arguments: - data_paths: List of paths to data files. - data_config: The data config. - tokenizer (Tokenizer): The tokenizer. - processor (Processor): The processor. - - Returns: - dataset (Dataset): The dataset. 
- """ - from torch.utils.data import Dataset - - from verl.utils.dataset.rl_dataset import RLHFDataset - - # Check if a custom dataset class is specified in the data configuration - # and if the path to the custom class is provided - if "custom_cls" in data_config and data_config.custom_cls.get("path", None) is not None: - # Dynamically load the custom dataset class - dataset_cls = load_extern_type(data_config.custom_cls.path, data_config.custom_cls.name) - # Verify that the custom dataset class inherits from torch.utils.data.Dataset - if not issubclass(dataset_cls, Dataset): - raise TypeError( - f"The custom dataset class '{data_config.custom_cls.name}' from " - f"'{data_config.custom_cls.path}' must inherit from torch.utils.data.Dataset" - ) - elif "datagen" in data_config and data_config.datagen.get("path", None) is not None and is_train: - # If a data generation strategy is specified, use the DynamicGenDataset class - from verl.utils.dataset.dynamicgen_dataset import DynamicGenDataset - - dataset_cls = DynamicGenDataset - print("Using DynamicGenDataset for data generation.") - - else: - # Use the default RLHFDataset class if no custom class is specified - dataset_cls = RLHFDataset - print(f"Using dataset class: {dataset_cls.__name__}") - - # Instantiate the dataset using the determined dataset class - dataset = dataset_cls( - data_files=data_paths, - tokenizer=tokenizer, - processor=processor, - config=data_config, - ) - - return dataset - - -def create_rl_sampler(data_config, dataset): - """Create a sampler for the dataset. - - Arguments: - data_config: The data config. - dataset (Dataset): The dataset. - - Returns: - sampler (Sampler): The sampler. 
- """ - import torch - from torch.utils.data import RandomSampler, SequentialSampler - - if data_config.sampler is not None and data_config.sampler.get("class_path", None) is not None: - curriculum_class = load_extern_type( - data_config.sampler.class_path, - data_config.sampler.class_name, - ) - sampler = curriculum_class( - data_source=dataset, - data_config=data_config, - ) - assert isinstance(sampler, AbstractSampler) - assert data_config.get("dataloader_num_workers", 8) == 0, ( - "If using curriculum, num_workers must be 0 to prevent data caching. " - "If the dataloader caches data before the batch is done the " - "curriculum sampler won't have the opportunity to reorder it. " - ) - - # Use a sampler to facilitate checkpoint resumption. - # If shuffling is enabled in the data configuration, create a random sampler. - elif data_config.shuffle: - train_dataloader_generator = torch.Generator() - train_dataloader_generator.manual_seed(data_config.get("seed", 1)) - sampler = RandomSampler(data_source=dataset, generator=train_dataloader_generator) - else: - # If shuffling is disabled, use a sequential sampler to iterate through the dataset in order. 
- sampler = SequentialSampler(data_source=dataset) - - return sampler - - -if __name__ == "__main__": - main() diff --git a/recipe/fully_async_policy/ray_trainer.py b/recipe/fully_async_policy/ray_trainer.py index dea3aa2c26e..0a74c5ed386 100644 --- a/recipe/fully_async_policy/ray_trainer.py +++ b/recipe/fully_async_policy/ray_trainer.py @@ -18,790 +18,41 @@ This trainer supports model-agonistic model initialization with huggingface """ -import json -import os import uuid -import warnings -from collections import defaultdict from copy import deepcopy -from dataclasses import dataclass, field -from enum import Enum from pprint import pprint -from typing import Optional import numpy as np import ray import torch -from omegaconf import OmegaConf, open_dict -from torch.utils.data import Dataset, Sampler -from torchdata.stateful_dataloader import StatefulDataLoader +from omegaconf import OmegaConf from tqdm import tqdm from verl import DataProto from verl.experimental.dataset.sampler import AbstractCurriculumSampler -from verl.protocol import pad_dataproto_to_divisor, unpad_dataproto -from verl.single_controller.base import Worker -from verl.single_controller.ray import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup +from verl.single_controller.ray import RayClassWithInitArgs from verl.single_controller.ray.base import create_colocated_worker_cls -from verl.trainer.config import AlgoConfig -from verl.trainer.ppo import core_algos from verl.trainer.ppo.core_algos import AdvantageEstimator, agg_loss from verl.trainer.ppo.metric_utils import ( compute_data_metrics, compute_throughout_metrics, compute_timing_metrics, - process_validation_metrics, ) +from verl.trainer.ppo.ray_trainer import RayPPOTrainer, compute_advantage from verl.trainer.ppo.reward import compute_reward, compute_reward_async -from verl.utils.checkpoint.checkpoint_manager import find_latest_ckpt_path, should_save_ckpt_esi +from verl.trainer.ppo.utils import Role +from 
verl.utils.checkpoint.checkpoint_manager import should_save_ckpt_esi from verl.utils.config import omega_conf_to_dataclass from verl.utils.debug import marked_timer from verl.utils.metric import ( reduce_metrics, ) -from verl.utils.seqlen_balancing import get_seqlen_balanced_partitions, log_seqlen_unbalance -from verl.utils.torch_functional import masked_mean -from verl.utils.tracking import ValidationGenerationsLogger - -WorkerType = type[Worker] - - -class Role(Enum): - """ - To create more roles dynamically, you can subclass Role and add new members - """ - - Actor = 0 - Rollout = 1 - ActorRollout = 2 - Critic = 3 - RefPolicy = 4 - RewardModel = 5 - ActorRolloutRef = 6 - - def __str__(self): - """返回与代码中一致的字符串表示""" - return self._get_role_string() - - def _get_role_string(self): - """获取角色对应的字符串名称""" - role_mapping = { - Role.Actor: "actor", - Role.Rollout: "rollout", - Role.ActorRollout: "actor_rollout", - Role.Critic: "critic", - Role.RefPolicy: "ref", - Role.RewardModel: "rm", - Role.ActorRolloutRef: "actor_rollout_ref", - } - return role_mapping.get(self, self.name.lower()) - - @classmethod - def from_string(cls, name: str): - """从字符串创建Role实例""" - string_mapping = { - "actor": cls.Actor, - "rollout": cls.Rollout, - "actor_rollout": cls.ActorRollout, - "critic": cls.Critic, - "ref": cls.RefPolicy, - "rm": cls.RewardModel, - "actor_rollout_ref": cls.ActorRolloutRef, - } - role = string_mapping.get(name.lower()) - if role is None: - raise ValueError(f"No Role found for string: {name}") - return role - - -@dataclass -class ResourcePoolManager: - """ - Define a resource pool specification. Resource pool will be initialized first. - """ - - resource_pool_spec: dict[str, list[int]] - mapping: dict[Role, str] - resource_pool_dict: dict[str, RayResourcePool] = field(default_factory=dict) - - def create_resource_pool(self): - """Create Ray resource pools for distributed training. 
- - Initializes resource pools based on the resource pool specification, - with each pool managing GPU resources across multiple nodes. - For FSDP backend, uses max_colocate_count=1 to merge WorkerGroups. - For Megatron backend, uses max_colocate_count>1 for different models. - """ - for resource_pool_name, process_on_nodes in self.resource_pool_spec.items(): - # max_colocate_count means the number of WorkerGroups (i.e. processes) in each RayResourcePool - # For FSDP backend, we recommend using max_colocate_count=1 that merge all WorkerGroups into one. - # For Megatron backend, we recommend using max_colocate_count>1 - # that can utilize different WorkerGroup for differnt models - resource_pool = RayResourcePool( - process_on_nodes=process_on_nodes, use_gpu=True, max_colocate_count=1, name_prefix=resource_pool_name - ) - self.resource_pool_dict[resource_pool_name] = resource_pool - - self._check_resource_available() - - def get_resource_pool(self, role: Role) -> RayResourcePool: - """Get the resource pool of the worker_cls""" - return self.resource_pool_dict[self.mapping[role]] - - def get_n_gpus(self) -> int: - """Get the number of gpus in this cluster.""" - return sum([n_gpus for process_on_nodes in self.resource_pool_spec.values() for n_gpus in process_on_nodes]) - - def _check_resource_available(self): - """Check if the resource pool can be satisfied in this ray cluster.""" - node_available_resources = ray.state.available_resources_per_node() - node_available_gpus = { - node: node_info.get("GPU", 0) if "GPU" in node_info else node_info.get("NPU", 0) - for node, node_info in node_available_resources.items() - } - - # check total required gpus can be satisfied - total_available_gpus = sum(node_available_gpus.values()) - total_required_gpus = sum( - [n_gpus for process_on_nodes in self.resource_pool_spec.values() for n_gpus in process_on_nodes] - ) - if total_available_gpus < total_required_gpus: - raise ValueError( - f"Total available GPUs {total_available_gpus} 
is less than total desired GPUs {total_required_gpus}" - ) - - # check each resource pool can be satisfied, O(#resource_pools * #nodes) - for resource_pool_name, process_on_nodes in self.resource_pool_spec.items(): - num_gpus, num_nodes = process_on_nodes[0], len(process_on_nodes) - for node, available_gpus in node_available_gpus.items(): - if available_gpus >= num_gpus: - node_available_gpus[node] -= num_gpus - num_nodes -= 1 - if num_nodes == 0: - break - if num_nodes > 0: - raise ValueError( - f"Resource pool {resource_pool_name}: {num_gpus}*{num_nodes}" - + "cannot be satisfied in this ray cluster" - ) - - -def apply_kl_penalty(data: DataProto, kl_ctrl: core_algos.AdaptiveKLController, kl_penalty="kl"): - """Apply KL penalty to the token-level rewards. - - This function computes the KL divergence between the reference policy and current policy, - then applies a penalty to the token-level rewards based on this divergence. - - Args: - data (DataProto): The data containing batched model outputs and inputs. - kl_ctrl (core_algos.AdaptiveKLController): Controller for adaptive KL penalty. - kl_penalty (str, optional): Type of KL penalty to apply. Defaults to "kl". - multi_turn (bool, optional): Whether the data is from a multi-turn conversation. Defaults to False. - - Returns: - tuple: A tuple containing: - - The updated data with token-level rewards adjusted by KL penalty - - A dictionary of metrics related to the KL penalty - """ - response_mask = data.batch["response_mask"] - token_level_scores = data.batch["token_level_scores"] - batch_size = data.batch.batch_size[0] - - # compute kl between ref_policy and current policy - # When apply_kl_penalty, algorithm.use_kl_in_reward=True, so the reference model has been enabled. 
- kld = core_algos.kl_penalty( - data.batch["old_log_probs"], data.batch["ref_log_prob"], kl_penalty=kl_penalty - ) # (batch_size, response_length) - kld = kld * response_mask - beta = kl_ctrl.value - - token_level_rewards = token_level_scores - beta * kld - - current_kl = masked_mean(kld, mask=response_mask, axis=-1) # average over sequence - current_kl = torch.mean(current_kl, dim=0).item() - - # according to https://github.com/huggingface/trl/blob/951ca1841f29114b969b57b26c7d3e80a39f75a0/trl/trainer/ppo_trainer.py#L837 - kl_ctrl.update(current_kl=current_kl, n_steps=batch_size) - data.batch["token_level_rewards"] = token_level_rewards - - metrics = {"actor/reward_kl_penalty": current_kl, "actor/reward_kl_penalty_coeff": beta} - - return data, metrics - - -def compute_response_mask(data: DataProto): - """Compute the attention mask for the response part of the sequence. - - This function extracts the portion of the attention mask that corresponds to the model's response, - which is used for masking computations that should only apply to response tokens. - - Args: - data (DataProto): The data containing batched model outputs and inputs. - - Returns: - torch.Tensor: The attention mask for the response tokens. - """ - responses = data.batch["responses"] - response_length = responses.size(1) - attention_mask = data.batch["attention_mask"] - return attention_mask[:, -response_length:] - - -def compute_advantage( - data: DataProto, - adv_estimator: AdvantageEstimator, - gamma: float = 1.0, - lam: float = 1.0, - num_repeat: int = 1, - norm_adv_by_std_in_grpo: bool = True, - config: Optional[AlgoConfig] = None, -) -> DataProto: - """Compute advantage estimates for policy optimization. - - This function computes advantage estimates using various estimators like GAE, GRPO, REINFORCE++, etc. - The advantage estimates are used to guide policy optimization in RL algorithms. - - Args: - data (DataProto): The data containing batched model outputs and inputs. 
- adv_estimator (AdvantageEstimator): The advantage estimator to use (e.g., GAE, GRPO, REINFORCE++). - gamma (float, optional): Discount factor for future rewards. Defaults to 1.0. - lam (float, optional): Lambda parameter for GAE. Defaults to 1.0. - num_repeat (int, optional): Number of times to repeat the computation. Defaults to 1. - norm_adv_by_std_in_grpo (bool, optional): Whether to normalize advantages by standard deviation in - GRPO. Defaults to True. - config (dict, optional): Configuration dictionary for algorithm settings. Defaults to None. - - Returns: - DataProto: The updated data with computed advantages and returns. - """ - # Back-compatible with trainers that do not compute response mask in fit - if "response_mask" not in data.batch.keys(): - data.batch["response_mask"] = compute_response_mask(data) - # prepare response group - if adv_estimator == AdvantageEstimator.GAE: - # Compute advantages and returns using Generalized Advantage Estimation (GAE) - advantages, returns = core_algos.compute_gae_advantage_return( - token_level_rewards=data.batch["token_level_rewards"], - values=data.batch["values"], - response_mask=data.batch["response_mask"], - gamma=gamma, - lam=lam, - ) - data.batch["advantages"] = advantages - data.batch["returns"] = returns - if config.get("use_pf_ppo", False): - data = core_algos.compute_pf_ppo_reweight_data( - data, - config.pf_ppo.get("reweight_method"), - config.pf_ppo.get("weight_pow"), - ) - elif adv_estimator == AdvantageEstimator.GRPO: - # Initialize the mask for GRPO calculation - grpo_calculation_mask = data.batch["response_mask"] - # Call compute_grpo_outcome_advantage with parameters matching its definition - advantages, returns = core_algos.compute_grpo_outcome_advantage( - token_level_rewards=data.batch["token_level_rewards"], - response_mask=grpo_calculation_mask, - index=data.non_tensor_batch["uid"], - norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo, - ) - data.batch["advantages"] = advantages - 
data.batch["returns"] = returns - else: - # handle all other adv estimator type other than GAE and GRPO - adv_estimator_fn = core_algos.get_adv_estimator_fn(adv_estimator) - adv_kwargs = { - "token_level_rewards": data.batch["token_level_rewards"], - "response_mask": data.batch["response_mask"], - "config": config, - } - if "uid" in data.non_tensor_batch: # optional - adv_kwargs["index"] = data.non_tensor_batch["uid"] - if "reward_baselines" in data.batch: # optional - adv_kwargs["reward_baselines"] = data.batch["reward_baselines"] - - # calculate advantage estimator - advantages, returns = adv_estimator_fn(**adv_kwargs) - data.batch["advantages"] = advantages - data.batch["returns"] = returns - return data - - -class RayPPOTrainer: - """Distributed PPO trainer using Ray for scalable reinforcement learning. - - This trainer orchestrates distributed PPO training across multiple nodes and GPUs, - managing actor rollouts, critic training, and reward computation with Ray backend. - Supports various model architectures including FSDP, Megatron, and vLLM integration. - """ - - # TODO: support each role have individual ray_worker_group_cls, - # i.e., support different backend of different role - def __init__( - self, - config, - tokenizer, - role_worker_mapping: dict[Role, WorkerType], - resource_pool_manager: ResourcePoolManager, - ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup, - processor=None, - reward_fn=None, - val_reward_fn=None, - train_dataset: Optional[Dataset] = None, - val_dataset: Optional[Dataset] = None, - collate_fn=None, - train_sampler: Optional[Sampler] = None, - device_name=None, - ): - """ - Initialize distributed PPO trainer with Ray backend. - Note that this trainer runs on the driver process on a single CPU/GPU node. - - Args: - config: Configuration object containing training parameters. - tokenizer: Tokenizer used for encoding and decoding text. - role_worker_mapping (dict[Role, WorkerType]): Mapping from roles to worker classes. 
- resource_pool_manager (ResourcePoolManager): Manager for Ray resource pools. - ray_worker_group_cls (RayWorkerGroup, optional): Class for Ray worker groups. Defaults to RayWorkerGroup. - processor: Optional data processor, used for multimodal data - reward_fn: Function for computing rewards during training. - val_reward_fn: Function for computing rewards during validation. - train_dataset (Optional[Dataset], optional): Training dataset. Defaults to None. - val_dataset (Optional[Dataset], optional): Validation dataset. Defaults to None. - collate_fn: Function to collate data samples into batches. - train_sampler (Optional[Sampler], optional): Sampler for the training dataset. Defaults to None. - device_name (str, optional): Device name for training (e.g., "cuda", "cpu"). Defaults to None. - """ - - # Store the tokenizer for text processing - self.tokenizer = tokenizer - self.processor = processor - self.config = config - self.reward_fn = reward_fn - self.val_reward_fn = val_reward_fn - - self.hybrid_engine = config.actor_rollout_ref.hybrid_engine - assert self.hybrid_engine, "Currently, only support hybrid engine" - - if self.hybrid_engine: - assert Role.ActorRollout in role_worker_mapping, f"{role_worker_mapping.keys()=}" - - self.role_worker_mapping = role_worker_mapping - self.resource_pool_manager = resource_pool_manager - self.use_reference_policy = Role.RefPolicy in role_worker_mapping - self.use_rm = Role.RewardModel in role_worker_mapping - self.ray_worker_group_cls = ray_worker_group_cls - self.device_name = device_name if device_name else self.config.trainer.device - self.validation_generations_logger = ValidationGenerationsLogger( - project_name=self.config.trainer.project_name, - experiment_name=self.config.trainer.experiment_name, - ) - - # if ref_in_actor is True, the reference policy will be actor without lora applied - self.ref_in_actor = config.actor_rollout_ref.model.get("lora_rank", 0) > 0 - - # define in-reward KL control - # kl loss control 
currently not suppoorted - if self.config.algorithm.use_kl_in_reward: - self.kl_ctrl_in_reward = core_algos.get_kl_controller(self.config.algorithm.kl_ctrl) - - if config.critic.enable is not None: - self.use_critic = bool(config.critic.enable) - elif self.config.algorithm.adv_estimator == AdvantageEstimator.GAE: - self.use_critic = True - else: - warnings.warn( - "Disabled critic as algorithm.adv_estimator != gae. " - "If it is not intended, please set critic.enable=True", - stacklevel=2, - ) - self.use_critic = False - - self._validate_config() - self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler) - - def _validate_config(self): - config = self.config - # number of GPUs total - n_gpus = config.trainer.n_gpus_per_node * config.trainer.nnodes - if config.actor_rollout_ref.actor.strategy == "megatron": - model_parallel_size = ( - config.actor_rollout_ref.actor.megatron.tensor_model_parallel_size - * config.actor_rollout_ref.actor.megatron.pipeline_model_parallel_size - ) - assert ( - n_gpus % (model_parallel_size * config.actor_rollout_ref.actor.megatron.context_parallel_size) == 0 - ), ( - f"n_gpus ({n_gpus}) must be divisible by model_parallel_size ({model_parallel_size}) times " - f"context_parallel_size ({config.actor_rollout_ref.actor.megatron.context_parallel_size})" - ) - megatron_dp = n_gpus // ( - model_parallel_size * config.actor_rollout_ref.actor.megatron.context_parallel_size - ) - self.minimal_bsz = megatron_dp * config.actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu - else: - self.minimal_bsz = n_gpus - - # 1. 
Check total batch size for data correctness - real_train_batch_size = config.data.train_batch_size * config.actor_rollout_ref.rollout.n - assert real_train_batch_size % self.minimal_bsz == 0, ( - f"real_train_batch_size ({real_train_batch_size}) must be divisible by minimal possible batch size " - f"({self.minimal_bsz})" - ) - - # A helper function to check "micro_batch_size" vs "micro_batch_size_per_gpu" - # We throw an error if the user sets both. The new convention is "..._micro_batch_size_per_gpu". - def check_mutually_exclusive(mbs, mbs_per_gpu, name: str): - """Validate mutually exclusive micro batch size configuration options. - - Ensures that users don't set both deprecated micro_batch_size and - the new micro_batch_size_per_gpu parameters simultaneously. - - Args: - mbs: Deprecated micro batch size parameter value. - mbs_per_gpu: New micro batch size per GPU parameter value. - name (str): Configuration section name for error messages. - - Raises: - ValueError: If both parameters are set or neither is set. - """ - settings = { - "reward_model": "micro_batch_size", - "actor_rollout_ref.ref": "log_prob_micro_batch_size", - "actor_rollout_ref.rollout": "log_prob_micro_batch_size", - } - - if name in settings: - param = settings[name] - param_per_gpu = f"{param}_per_gpu" - - if mbs is None and mbs_per_gpu is None: - raise ValueError( - f"[{name}] Please set at least one of '{name}.{param}' or '{name}.{param_per_gpu}'." - ) - - if mbs is not None and mbs_per_gpu is not None: - raise ValueError( - f"[{name}] You have set both '{name}.{param}' AND '{name}.{param_per_gpu}'. Please remove " - f"'{name}.{param}' because only '*_{param_per_gpu}' is supported (the former is deprecated)." 
- ) - - # Actor validation done in ActorConfig.__post_init__ and validate() - actor_config = omega_conf_to_dataclass(config.actor_rollout_ref.actor) - actor_config.validate(n_gpus, config.data.train_batch_size, config.actor_rollout_ref.model) - - if not config.actor_rollout_ref.actor.use_dynamic_bsz: - if self.use_reference_policy: - # reference: log_prob_micro_batch_size vs. log_prob_micro_batch_size_per_gpu - check_mutually_exclusive( - config.actor_rollout_ref.ref.log_prob_micro_batch_size, - config.actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu, - "actor_rollout_ref.ref", - ) - - # The rollout section also has log_prob_micro_batch_size vs. log_prob_micro_batch_size_per_gpu - check_mutually_exclusive( - config.actor_rollout_ref.rollout.log_prob_micro_batch_size, - config.actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu, - "actor_rollout_ref.rollout", - ) - - # Check for reward model micro-batch size conflicts - if config.reward_model.enable and not config.reward_model.use_dynamic_bsz: - check_mutually_exclusive( - config.reward_model.micro_batch_size, config.reward_model.micro_batch_size_per_gpu, "reward_model" - ) - - if self.config.algorithm.use_kl_in_reward and config.actor_rollout_ref.actor.use_kl_loss: - print("NOTICE: You have both enabled in-reward kl and kl loss.") - - # critic - if self.use_critic: - critic_config = omega_conf_to_dataclass(config.critic) - critic_config.validate(n_gpus, config.data.train_batch_size) - - if config.data.get("val_batch_size", None) is not None: - print( - "WARNING: val_batch_size is deprecated." - + " Validation datasets are sent to inference engines as a whole batch," - + " which will schedule the memory themselves." 
- ) - - # check eval config - if config.actor_rollout_ref.rollout.val_kwargs.do_sample: - assert config.actor_rollout_ref.rollout.temperature > 0, ( - "validation gen temperature should be greater than 0 when enabling do_sample" - ) - - print("[validate_config] All configuration checks passed successfully!") - - def _create_dataloader(self, train_dataset, val_dataset, collate_fn, train_sampler: Optional[Sampler]): - """ - Creates the train and validation dataloaders. - """ - # TODO: we have to make sure the batch size is divisible by the dp size - from verl.trainer.main_ppo import create_rl_dataset, create_rl_sampler - - if train_dataset is None: - train_dataset = create_rl_dataset( - self.config.data.train_files, self.config.data, self.tokenizer, self.processor - ) - if val_dataset is None: - val_dataset = create_rl_dataset( - self.config.data.val_files, self.config.data, self.tokenizer, self.processor - ) - self.train_dataset, self.val_dataset = train_dataset, val_dataset - - if train_sampler is None: - train_sampler = create_rl_sampler(self.config.data, self.train_dataset) - if collate_fn is None: - from verl.utils.dataset.rl_dataset import collate_fn as default_collate_fn - - collate_fn = default_collate_fn - - num_workers = self.config.data["dataloader_num_workers"] - - self.train_dataloader = StatefulDataLoader( - dataset=self.train_dataset, - batch_size=self.config.data.get("gen_batch_size", self.config.data.train_batch_size), - num_workers=num_workers, - drop_last=True, - collate_fn=collate_fn, - sampler=train_sampler, - ) - - val_batch_size = self.config.data.val_batch_size # Prefer config value if set - if val_batch_size is None: - val_batch_size = len(self.val_dataset) - - self.val_dataloader = StatefulDataLoader( - dataset=self.val_dataset, - batch_size=val_batch_size, - num_workers=num_workers, - shuffle=self.config.data.get("validation_shuffle", True), - drop_last=False, - collate_fn=collate_fn, - ) - - assert len(self.train_dataloader) >= 1, "Train 
dataloader is empty!" - assert len(self.val_dataloader) >= 1, "Validation dataloader is empty!" - - print( - f"Size of train dataloader: {len(self.train_dataloader)}, Size of val dataloader: " - f"{len(self.val_dataloader)}" - ) - - total_training_steps = len(self.train_dataloader) * self.config.trainer.total_epochs - - if self.config.trainer.total_training_steps is not None: - total_training_steps = self.config.trainer.total_training_steps - - self.total_training_steps = total_training_steps - print(f"Total training steps: {self.total_training_steps}") - - try: - OmegaConf.set_struct(self.config, True) - with open_dict(self.config): - if OmegaConf.select(self.config, "actor_rollout_ref.actor.optim"): - self.config.actor_rollout_ref.actor.optim.total_training_steps = total_training_steps - if OmegaConf.select(self.config, "critic.optim"): - self.config.critic.optim.total_training_steps = total_training_steps - except Exception as e: - print(f"Warning: Could not set total_training_steps in config. Structure missing? 
Error: {e}") - - def _dump_generations(self, inputs, outputs, scores, reward_extra_infos_dict, dump_path): - """Dump rollout/validation samples as JSONL.""" - os.makedirs(dump_path, exist_ok=True) - filename = os.path.join(dump_path, f"{self.global_steps}.jsonl") - - n = len(inputs) - base_data = { - "input": inputs, - "output": outputs, - "score": scores, - "step": [self.global_steps] * n, - } - - for k, v in reward_extra_infos_dict.items(): - if len(v) == n: - base_data[k] = v - - lines = [] - for i in range(n): - entry = {k: v[i] for k, v in base_data.items()} - lines.append(json.dumps(entry, ensure_ascii=False)) - - with open(filename, "w") as f: - f.write("\n".join(lines) + "\n") - - print(f"Dumped generations to {filename}") - - def _maybe_log_val_generations(self, inputs, outputs, scores): - """Log a table of validation samples to the configured logger (wandb or swanlab)""" - - generations_to_log = self.config.trainer.log_val_generations - - if generations_to_log == 0: - return - - import numpy as np - - # Create tuples of (input, output, score) and sort by input text - samples = list(zip(inputs, outputs, scores, strict=True)) - samples.sort(key=lambda x: x[0]) # Sort by input text +from verl.utils.rollout_skip import RolloutSkip - # Use fixed random seed for deterministic shuffling - rng = np.random.RandomState(42) - rng.shuffle(samples) - # Take first N samples after shuffling - samples = samples[:generations_to_log] - - # Log to each configured logger - self.validation_generations_logger.log(self.config.trainer.logger, samples, self.global_steps) - - def _validate(self): - data_source_lst = [] - reward_extra_infos_dict: dict[str, list] = defaultdict(list) - - # Lists to collect samples for the table - sample_inputs = [] - sample_outputs = [] - sample_scores = [] - sample_turns = [] - - for test_data in self.val_dataloader: - test_batch = DataProto.from_single_dict(test_data) - - # repeat test batch - test_batch = test_batch.repeat( - 
repeat_times=self.config.actor_rollout_ref.rollout.val_kwargs.n, interleave=True - ) - - # we only do validation on rule-based rm - if self.config.reward_model.enable and test_batch[0].non_tensor_batch["reward_model"]["style"] == "model": - return {} - - # Store original inputs - input_ids = test_batch.batch["input_ids"] - # TODO: Can we keep special tokens except for padding tokens? - input_texts = [self.tokenizer.decode(ids, skip_special_tokens=True) for ids in input_ids] - sample_inputs.extend(input_texts) - - batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"] - non_tensor_batch_keys_to_pop = ["raw_prompt_ids"] - if "multi_modal_data" in test_batch.non_tensor_batch: - non_tensor_batch_keys_to_pop.append("multi_modal_data") - if "raw_prompt" in test_batch.non_tensor_batch: - non_tensor_batch_keys_to_pop.append("raw_prompt") - if "tools_kwargs" in test_batch.non_tensor_batch: - non_tensor_batch_keys_to_pop.append("tools_kwargs") - if "interaction_kwargs" in test_batch.non_tensor_batch: - non_tensor_batch_keys_to_pop.append("interaction_kwargs") - if "agent_name" in test_batch.non_tensor_batch: - non_tensor_batch_keys_to_pop.append("agent_name") - test_gen_batch = test_batch.pop( - batch_keys=batch_keys_to_pop, - non_tensor_batch_keys=non_tensor_batch_keys_to_pop, - ) - - test_gen_batch.meta_info = { - "eos_token_id": self.tokenizer.eos_token_id, - "pad_token_id": self.tokenizer.pad_token_id, - "recompute_log_prob": False, - "do_sample": self.config.actor_rollout_ref.rollout.val_kwargs.do_sample, - "validate": True, - "global_steps": self.global_steps, - } - print(f"test_gen_batch meta info: {test_gen_batch.meta_info}") - - # pad to be divisible by dp_size - size_divisor = ( - self.actor_rollout_wg.world_size - if not self.async_rollout_mode - else self.config.actor_rollout_ref.rollout.agent.num_workers - ) - test_gen_batch_padded, pad_size = pad_dataproto_to_divisor(test_gen_batch, size_divisor) - if not self.async_rollout_mode: - 
test_output_gen_batch_padded = self.actor_rollout_wg.generate_sequences(test_gen_batch_padded) - else: - test_output_gen_batch_padded = self.async_rollout_manager.generate_sequences(test_gen_batch_padded) - - # unpad - test_output_gen_batch = unpad_dataproto(test_output_gen_batch_padded, pad_size=pad_size) - - print("validation generation end") - - # Store generated outputs - output_ids = test_output_gen_batch.batch["responses"] - output_texts = [self.tokenizer.decode(ids, skip_special_tokens=True) for ids in output_ids] - sample_outputs.extend(output_texts) - - test_batch = test_batch.union(test_output_gen_batch) - test_batch.meta_info["validate"] = True - - # evaluate using reward_function - result = self.val_reward_fn(test_batch, return_dict=True) - reward_tensor = result["reward_tensor"] - scores = reward_tensor.sum(-1).cpu().tolist() - sample_scores.extend(scores) - - reward_extra_infos_dict["reward"].extend(scores) - print(f"len reward_extra_infos_dict['reward']: {len(reward_extra_infos_dict['reward'])}") - if "reward_extra_info" in result: - for key, lst in result["reward_extra_info"].items(): - reward_extra_infos_dict[key].extend(lst) - print(f"len reward_extra_infos_dict['{key}']: {len(reward_extra_infos_dict[key])}") - - # collect num_turns of each prompt - if "__num_turns__" in test_batch.non_tensor_batch: - sample_turns.append(test_batch.non_tensor_batch["__num_turns__"]) - - data_source_lst.append(test_batch.non_tensor_batch.get("data_source", ["unknown"] * reward_tensor.shape[0])) - - self._maybe_log_val_generations(inputs=sample_inputs, outputs=sample_outputs, scores=sample_scores) - - # dump generations - val_data_dir = self.config.trainer.get("validation_data_dir", None) - if val_data_dir: - self._dump_generations( - inputs=sample_inputs, - outputs=sample_outputs, - scores=sample_scores, - reward_extra_infos_dict=reward_extra_infos_dict, - dump_path=val_data_dir, - ) - - for key_info, lst in reward_extra_infos_dict.items(): - assert len(lst) == 0 
or len(lst) == len(sample_scores), f"{key_info}: {len(lst)=}, {len(sample_scores)=}" - - data_sources = np.concatenate(data_source_lst, axis=0) - - data_src2var2metric2val = process_validation_metrics(data_sources, sample_inputs, reward_extra_infos_dict) - metric_dict = {} - for data_source, var2metric2val in data_src2var2metric2val.items(): - core_var = "acc" if "acc" in var2metric2val else "reward" - for var_name, metric2val in var2metric2val.items(): - n_max = max([int(name.split("@")[-1].split("/")[0]) for name in metric2val.keys()]) - for metric_name, metric_val in metric2val.items(): - if ( - (var_name == core_var) - and any(metric_name.startswith(pfx) for pfx in ["mean", "maj", "best"]) - and (f"@{n_max}" in metric_name) - ): - metric_sec = "val-core" - else: - metric_sec = "val-aux" - pfx = f"{metric_sec}/{data_source}/{var_name}/{metric_name}" - metric_dict[pfx] = metric_val - - if len(sample_turns) > 0: - sample_turns = np.concatenate(sample_turns) - metric_dict["val-aux/num_turns/min"] = sample_turns.min() - metric_dict["val-aux/num_turns/max"] = sample_turns.max() - metric_dict["val-aux/num_turns/mean"] = sample_turns.mean() - - return metric_dict +class FullyAsyncRayPPOTrainer(RayPPOTrainer): + def __init__(self, *args, **kwargs): + pass def init_workers(self): """Initialize distributed training workers using Ray backend. 
@@ -818,6 +69,7 @@ def init_workers(self): def _init_resource_pools(self): self.resource_pool_manager.create_resource_pool() + self.resource_pool_to_cls = {pool: {} for pool in self.resource_pool_manager.resource_pool_dict.values()} def _create_worker_classes(self): @@ -878,14 +130,17 @@ def _init_worker_groups(self): wg_kwargs = {} # Setting up kwargs for RayWorkerGroup if OmegaConf.select(self.config.trainer, "ray_wait_register_center_timeout") is not None: wg_kwargs["ray_wait_register_center_timeout"] = self.config.trainer.ray_wait_register_center_timeout - if OmegaConf.select(self.config.trainer, "profile_steps") is not None: - wg_kwargs["profile_steps"] = OmegaConf.select(self.config.trainer, "profile_steps") - assert OmegaConf.select(self.config.trainer, "worker_nsight_options") is not None, ( - "worker_nsight_options must be set when profile_steps is set" - ) - wg_kwargs["worker_nsight_options"] = OmegaConf.to_container( - OmegaConf.select(self.config.trainer, "worker_nsight_options") - ) + if OmegaConf.select(self.config.global_profiler, "steps") is not None: + wg_kwargs["profile_steps"] = OmegaConf.select(self.config.global_profiler, "steps") + # Only require nsight worker options when tool is nsys + if OmegaConf.select(self.config.global_profiler, "tool") == "nsys": + assert ( + OmegaConf.select(self.config.global_profiler.global_tool_config.nsys, "worker_nsight_options") + is not None + ), "worker_nsight_options must be set when using nsys with profile_steps" + wg_kwargs["worker_nsight_options"] = OmegaConf.to_container( + OmegaConf.select(self.config.global_profiler.global_tool_config.nsys, "worker_nsight_options") + ) wg_kwargs["device_name"] = self.device_name for resource_pool, class_dict in self.resource_pool_to_cls.items(): @@ -920,170 +175,14 @@ def _init_async_rollout_manager(self): # create async rollout manager and request scheduler self.async_rollout_mode = False if self.config.actor_rollout_ref.rollout.mode == "async": - from 
recipe.fully_async_policy.agent_loop.agent_loop import AgentLoopManager + from recipe.fully_async_policy.agent_loop.agent_loop import FullyAgentLoopManager self.async_rollout_mode = True - self.async_rollout_manager = AgentLoopManager( + self.async_rollout_manager = FullyAgentLoopManager( config=self.config, worker_group=self.actor_rollout_wg, ) - def _save_checkpoint(self): - from verl.utils.fs import local_mkdir_safe - - # path: given_path + `/global_step_{global_steps}` + `/actor` - local_global_step_folder = os.path.join( - self.config.trainer.default_local_dir, f"global_step_{self.global_steps}" - ) - - print(f"local_global_step_folder: {local_global_step_folder}") - actor_local_path = os.path.join(local_global_step_folder, "actor") - - actor_remote_path = ( - None - if self.config.trainer.default_hdfs_dir is None - else os.path.join(self.config.trainer.default_hdfs_dir, f"global_step_{self.global_steps}", "actor") - ) - - remove_previous_ckpt_in_save = self.config.trainer.get("remove_previous_ckpt_in_save", False) - if remove_previous_ckpt_in_save: - print( - "Warning: remove_previous_ckpt_in_save is deprecated," - + " set max_actor_ckpt_to_keep=1 and max_critic_ckpt_to_keep=1 instead" - ) - max_actor_ckpt_to_keep = ( - self.config.trainer.get("max_actor_ckpt_to_keep", None) if not remove_previous_ckpt_in_save else 1 - ) - max_critic_ckpt_to_keep = ( - self.config.trainer.get("max_critic_ckpt_to_keep", None) if not remove_previous_ckpt_in_save else 1 - ) - - self.actor_rollout_wg.save_checkpoint( - actor_local_path, actor_remote_path, self.global_steps, max_ckpt_to_keep=max_actor_ckpt_to_keep - ) - - if self.use_critic: - critic_local_path = os.path.join(local_global_step_folder, "critic") - critic_remote_path = ( - None - if self.config.trainer.default_hdfs_dir is None - else os.path.join(self.config.trainer.default_hdfs_dir, f"global_step_{self.global_steps}", "critic") - ) - self.critic_wg.save_checkpoint( - critic_local_path, critic_remote_path, 
self.global_steps, max_ckpt_to_keep=max_critic_ckpt_to_keep - ) - - # save dataloader - local_mkdir_safe(local_global_step_folder) - dataloader_local_path = os.path.join(local_global_step_folder, "data.pt") - dataloader_state_dict = self.train_dataloader.state_dict() - torch.save(dataloader_state_dict, dataloader_local_path) - - # latest checkpointed iteration tracker (for atomic usage) - local_latest_checkpointed_iteration = os.path.join( - self.config.trainer.default_local_dir, "latest_checkpointed_iteration.txt" - ) - with open(local_latest_checkpointed_iteration, "w") as f: - f.write(str(self.global_steps)) - - def _load_checkpoint(self): - if self.config.trainer.resume_mode == "disable": - return 0 - - # load from hdfs - if self.config.trainer.default_hdfs_dir is not None: - raise NotImplementedError("load from hdfs is not implemented yet") - else: - checkpoint_folder = self.config.trainer.default_local_dir # TODO: check path - if not os.path.isabs(checkpoint_folder): - working_dir = os.getcwd() - checkpoint_folder = os.path.join(working_dir, checkpoint_folder) - global_step_folder = find_latest_ckpt_path(checkpoint_folder) # None if no latest - - # find global_step_folder - if self.config.trainer.resume_mode == "auto": - if global_step_folder is None: - print("Training from scratch") - return 0 - else: - if self.config.trainer.resume_mode == "resume_path": - assert isinstance(self.config.trainer.resume_from_path, str), "resume ckpt must be str type" - assert "global_step_" in self.config.trainer.resume_from_path, ( - "resume ckpt must specify the global_steps" - ) - global_step_folder = self.config.trainer.resume_from_path - if not os.path.isabs(global_step_folder): - working_dir = os.getcwd() - global_step_folder = os.path.join(working_dir, global_step_folder) - print(f"Load from checkpoint folder: {global_step_folder}") - # set global step - self.global_steps = int(global_step_folder.split("global_step_")[-1]) - - print(f"Setting global step to 
{self.global_steps}") - print(f"Resuming from {global_step_folder}") - - actor_path = os.path.join(global_step_folder, "actor") - critic_path = os.path.join(global_step_folder, "critic") - # load actor - self.actor_rollout_wg.load_checkpoint( - actor_path, del_local_after_load=self.config.trainer.del_local_ckpt_after_load - ) - # load critic - if self.use_critic: - self.critic_wg.load_checkpoint( - critic_path, del_local_after_load=self.config.trainer.del_local_ckpt_after_load - ) - - # load dataloader, - # TODO: from remote not implemented yet - dataloader_local_path = os.path.join(global_step_folder, "data.pt") - if os.path.exists(dataloader_local_path): - dataloader_state_dict = torch.load(dataloader_local_path, weights_only=False) - self.train_dataloader.load_state_dict(dataloader_state_dict) - else: - print(f"Warning: No dataloader state found at {dataloader_local_path}, will start from scratch") - - def _start_profiling(self, do_profile: bool, timing_raw) -> None: - """Start profiling for all worker groups if profiling is enabled.""" - with marked_timer("start_profile", timing_raw): - if do_profile: - self.actor_rollout_wg.start_profile(role="e2e", profile_step=self.global_steps) - if self.use_reference_policy: - self.ref_policy_wg.start_profile() - if self.use_critic: - self.critic_wg.start_profile() - if self.use_rm: - self.rm_wg.start_profile() - - def _stop_profiling(self, do_profile: bool, timing_raw) -> None: - """Stop profiling for all worker groups if profiling is enabled.""" - with marked_timer("stop_profile", timing_raw): - if do_profile: - self.actor_rollout_wg.stop_profile() - if self.use_reference_policy: - self.ref_policy_wg.stop_profile() - if self.use_critic: - self.critic_wg.stop_profile() - if self.use_rm: - self.rm_wg.stop_profile() - - def _balance_batch(self, batch: DataProto, metrics, logging_prefix="global_seqlen"): - """Reorder the data on single controller such that each dp rank gets similar total tokens""" - attention_mask = 
batch.batch["attention_mask"] - batch_size = attention_mask.shape[0] - global_seqlen_lst = batch.batch["attention_mask"].view(batch_size, -1).sum(-1).tolist() # (train_batch_size,) - world_size = self.actor_rollout_wg.world_size - global_partition_lst = get_seqlen_balanced_partitions( - global_seqlen_lst, k_partitions=world_size, equal_size=True - ) - # reorder based on index. The data will be automatically equally partitioned by dispatch function - global_idx = torch.tensor([j for partition in global_partition_lst for j in partition]) - batch.reorder(global_idx) - global_balance_stats = log_seqlen_unbalance( - seqlen_list=global_seqlen_lst, partitions=global_partition_lst, prefix=logging_prefix - ) - metrics.update(global_balance_stats) - def fit(self): """ The training loop of PPO. @@ -1117,6 +216,10 @@ def fit(self): if self.config.trainer.get("val_only", False): return + if self.config.actor_rollout_ref.rollout.get("skip_rollout", False): + rollout_skip = RolloutSkip(self.config, self.actor_rollout_wg) + rollout_skip.wrap_generate_sequences() + # add tqdm progress_bar = tqdm(total=self.total_training_steps, initial=self.global_steps, desc="Training Progress") @@ -1125,17 +228,25 @@ def fit(self): last_val_metrics = None self.max_steps_duration = 0 + prev_step_profile = False + curr_step_profile = ( + self.global_steps in self.config.global_profiler.steps + if self.config.global_profiler.steps is not None + else False + ) + next_step_profile = False + for epoch in range(self.config.trainer.total_epochs): for batch_dict in self.train_dataloader: metrics = {} timing_raw = {} - do_profile = ( - self.global_steps in self.config.trainer.profile_steps - if self.config.trainer.profile_steps is not None - else False - ) - self._start_profiling(do_profile, timing_raw) + with marked_timer("start_profile", timing_raw): + self._start_profiling( + not prev_step_profile and curr_step_profile + if self.config.global_profiler.profile_continuous_steps + else curr_step_profile + 
) batch, gen_batch = self._prepare_generate_batch(batch_dict) @@ -1152,6 +263,9 @@ def fit(self): gen_batch_output.meta_info.pop("timing", None) if self.config.algorithm.adv_estimator == AdvantageEstimator.REMAX: + if self.reward_fn is None: + raise ValueError("A reward_fn is required for REMAX advantage estimation.") + with marked_timer("gen_max", timing_raw, color="purple"): gen_baseline_batch = deepcopy(gen_batch) gen_baseline_batch.meta_info["do_sample"] = False @@ -1172,10 +286,24 @@ def fit(self): batch = self._post_generate_batch(batch, gen_batch_output, metrics) batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw) self._log_rollout(batch, reward_extra_infos_dict, timing_raw) - last_val_metrics = self._validate_metrics(is_last_step, last_val_metrics, metrics, timing_raw) - self._check_save_checkpoint(is_last_step, timing_raw) - self._stop_profiling(do_profile, timing_raw) + last_val_metrics = self._validate_metrics(is_last_step, last_val_metrics, metrics, timing_raw) + self._check_save_checkpoint(is_last_step, timing_raw) + + with marked_timer("stop_profile", timing_raw): + next_step_profile = ( + self.global_steps + 1 in self.config.global_profiler.steps + if self.config.global_profiler.steps is not None + else False + ) + self._stop_profiling( + curr_step_profile and not next_step_profile + if self.config.global_profiler.profile_continuous_steps + else curr_step_profile + ) + prev_step_profile = curr_step_profile + curr_step_profile = next_step_profile + self._collect_metrics(batch, epoch, metrics, timing_raw) self._post_batch_processing(batch) @@ -1185,6 +313,14 @@ def fit(self): progress_bar.update(1) self.global_steps += 1 + if ( + hasattr(self.config.actor_rollout_ref.actor, "profiler") + and self.config.actor_rollout_ref.actor.profiler.tool == "torch_memory" + ): + self.actor_rollout_wg.dump_memory_snapshot( + tag=f"post_update_step{self.global_steps}", sub_dir=f"step{self.global_steps}" + ) + if is_last_step: 
pprint(f"Final validation metrics: {last_val_metrics}") progress_bar.close() @@ -1192,35 +328,22 @@ def fit(self): def _prepare_generate_batch(self, batch_dict): batch: DataProto = DataProto.from_single_dict(batch_dict) - # pop those keys for generation - batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"] - non_tensor_batch_keys_to_pop = ["raw_prompt_ids"] - if "multi_modal_data" in batch.non_tensor_batch: - non_tensor_batch_keys_to_pop.append("multi_modal_data") - if "raw_prompt" in batch.non_tensor_batch: - non_tensor_batch_keys_to_pop.append("raw_prompt") - if "tools_kwargs" in batch.non_tensor_batch: - non_tensor_batch_keys_to_pop.append("tools_kwargs") - if "interaction_kwargs" in batch.non_tensor_batch: - non_tensor_batch_keys_to_pop.append("interaction_kwargs") - if "index" in batch.non_tensor_batch: - non_tensor_batch_keys_to_pop.append("index") - if "agent_name" in batch.non_tensor_batch: - non_tensor_batch_keys_to_pop.append("agent_name") - gen_batch = batch.pop( - batch_keys=batch_keys_to_pop, - non_tensor_batch_keys=non_tensor_batch_keys_to_pop, - ) + + # add uid to batch + batch.non_tensor_batch["uid"] = np.array([str(uuid.uuid4()) for _ in range(len(batch.batch))], dtype=object) + + gen_batch = self._get_gen_batch(batch) + # pass global_steps to trace gen_batch.meta_info["global_steps"] = self.global_steps gen_batch = gen_batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True) return batch, gen_batch def _post_generate_batch(self, batch, gen_batch_output, metrics): - batch.non_tensor_batch["uid"] = np.array([str(uuid.uuid4()) for _ in range(len(batch.batch))], dtype=object) # repeat to align with repeated responses in rollout batch = batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True) batch = batch.union(gen_batch_output) + if "response_mask" not in batch.batch.keys(): batch.batch["response_mask"] = compute_response_mask(batch) # Balance the number of valid tokens across DP 
ranks. @@ -1230,8 +353,10 @@ def _post_generate_batch(self, batch, gen_batch_output, metrics): # TODO: Decouple the DP balancing and mini-batching. if self.config.trainer.balance_batch: self._balance_batch(batch, metrics=metrics) + # compute global_valid tokens batch.meta_info["global_token_num"] = torch.sum(batch.batch["attention_mask"], dim=-1).tolist() + return batch def _process_batch_common(self, batch, metrics, timing_raw): @@ -1245,6 +370,7 @@ def _process_batch_common(self, batch, metrics, timing_raw): future_reward = compute_reward_async.remote(data=batch, reward_fn=self.reward_fn) else: reward_tensor, reward_extra_infos_dict = compute_reward(batch, self.reward_fn) + # recompute old_log_probs with marked_timer("old_log_prob", timing_raw, color="blue"): async_training = self.config.get("async_training", None) @@ -1265,27 +391,9 @@ def _process_batch_common(self, batch, metrics, timing_raw): if "rollout_log_probs" in batch.batch.keys(): # TODO: we may want to add diff of probs too. 
- rollout_old_log_probs = batch.batch["rollout_log_probs"] - actor_old_log_probs = batch.batch["old_log_probs"] - attention_mask = batch.batch["attention_mask"] - responses = batch.batch["responses"] - response_length = responses.size(1) - response_mask = attention_mask[:, -response_length:] - - rollout_probs = torch.exp(rollout_old_log_probs) - actor_probs = torch.exp(actor_old_log_probs) - rollout_probs_diff = torch.abs(rollout_probs - actor_probs) - rollout_probs_diff = torch.masked_select(rollout_probs_diff, response_mask.bool()) - rollout_probs_diff_max = torch.max(rollout_probs_diff) - rollout_probs_diff_mean = torch.mean(rollout_probs_diff) - rollout_probs_diff_std = torch.std(rollout_probs_diff) - metrics.update( - { - "training/rollout_probs_diff_max": rollout_probs_diff_max.detach().item(), - "training/rollout_probs_diff_mean": rollout_probs_diff_mean.detach().item(), - "training/rollout_probs_diff_std": rollout_probs_diff_std.detach().item(), - } - ) + from verl.utils.debug.metrics import calculate_debug_metrics + + metrics.update(calculate_debug_metrics(batch)) if self.use_reference_policy: # compute reference log_prob @@ -1295,11 +403,13 @@ def _process_batch_common(self, batch, metrics, timing_raw): else: ref_log_prob = self.actor_rollout_wg.compute_ref_log_prob(batch) batch = batch.union(ref_log_prob) + # compute values if self.use_critic: with marked_timer("values", timing_raw, color="cyan"): values = self.critic_wg.compute_values(batch) batch = batch.union(values) + with marked_timer("adv", timing_raw, color="brown"): # we combine with rule-based rm reward_extra_infos_dict: dict[str, list] @@ -1334,12 +444,14 @@ def _process_batch_common(self, batch, metrics, timing_raw): norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo, config=self.config.algorithm, ) + # update critic if self.use_critic: with marked_timer("update_critic", timing_raw, color="pink"): critic_output = self.critic_wg.update_critic(batch) critic_output_metrics = 
reduce_metrics(critic_output.meta_info["metrics"]) metrics.update(critic_output_metrics) + # implement critic warmup if self.config.trainer.critic_warmup <= self.global_steps: # update actor @@ -1351,21 +463,25 @@ def _process_batch_common(self, batch, metrics, timing_raw): return batch, reward_extra_infos_dict def _log_rollout(self, batch, reward_extra_infos_dict, timing_raw): - """Log rollout generations if enabled""" + # Log rollout generations if enabled rollout_data_dir = self.config.trainer.get("rollout_data_dir", None) if rollout_data_dir: with marked_timer("dump_rollout_generations", timing_raw, color="green"): inputs = self.tokenizer.batch_decode(batch.batch["prompts"], skip_special_tokens=True) outputs = self.tokenizer.batch_decode(batch.batch["responses"], skip_special_tokens=True) scores = batch.batch["token_level_scores"].sum(-1).cpu().tolist() + sample_gts = [item.non_tensor_batch.get("reward_model", {}).get("ground_truth", None) for item in batch] + if "request_id" in batch.non_tensor_batch: reward_extra_infos_dict.setdefault( "request_id", batch.non_tensor_batch["request_id"].tolist(), ) + self._dump_generations( inputs=inputs, outputs=outputs, + gts=sample_gts, scores=scores, reward_extra_infos_dict=reward_extra_infos_dict, dump_path=rollout_data_dir, @@ -1382,7 +498,7 @@ def _validate_metrics(self, is_last_step, last_val_metrics, metrics, timing_raw) if is_last_step: last_val_metrics = val_metrics metrics.update(val_metrics) - return last_val_metrics + return last_val_metrics def _check_save_checkpoint(self, is_last_step, timing_raw): # Check if the ESI (Elastic Server Instance)/training plan is close to expiration. 
@@ -1408,6 +524,7 @@ def _check_save_checkpoint(self, is_last_step, timing_raw): def _collect_metrics(self, batch, epoch, metrics, timing_raw): steps_duration = timing_raw["step"] self.max_steps_duration = max(self.max_steps_duration, steps_duration) + # training metrics metrics.update( { diff --git a/verl/experimental/agent_loop/__init__.py b/verl/experimental/agent_loop/__init__.py index 88d61ee4169..c9c1f5bd77d 100644 --- a/verl/experimental/agent_loop/__init__.py +++ b/verl/experimental/agent_loop/__init__.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .agent_loop import AgentLoopBase, AgentLoopManager, AsyncLLMServerManager +# from .agent_loop import AgentLoopBase, AgentLoopManager, AsyncLLMServerManager, AgentLoopWorker from .single_turn_agent_loop import SingleTurnAgentLoop from .tool_agent_loop import ToolAgentLoop _ = [SingleTurnAgentLoop, ToolAgentLoop] -__all__ = ["AgentLoopBase", "AgentLoopManager", "AsyncLLMServerManager"] +# __all__ = ["AgentLoopBase", "AgentLoopManager", "AsyncLLMServerManager", "AgentLoopWorker"] diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py index 1c8d0eac928..f520f89472a 100644 --- a/verl/experimental/agent_loop/agent_loop.py +++ b/verl/experimental/agent_loop/agent_loop.py @@ -179,12 +179,12 @@ class AgentLoopBase(ABC): _class_initialized = False def __init__( - self, - trainer_config: _DummyConfig, - server_manager: AsyncLLMServerManager, - tokenizer: AutoTokenizer, - processor: AutoProcessor, - **kwargs, + self, + trainer_config: _DummyConfig, + server_manager: AsyncLLMServerManager, + tokenizer: AutoTokenizer, + processor: AutoProcessor, + **kwargs, ): """Initialize agent loop, each sample will have its own loop instance. 
@@ -329,8 +329,8 @@ def __init__(self, config: DictConfig, local_path: str, rm_executor: BatchExecut self.rm_executor = rm_executor def compute_score( - self, - data: DataProto, + self, + data: DataProto, ) -> dict: """Compute reward score for agent loop output. @@ -355,7 +355,7 @@ class AgentLoopWorker: """Agent loop worker takes a batch of messages and run each message in an agent loop.""" def __init__( - self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], rm_executor: BatchExecutor = None + self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], rm_executor: BatchExecutor = None ): """Initialize agent loop manager. @@ -364,7 +364,11 @@ def __init__( server_handles (List[ray.actor.ActorHandle]): OpenAI compatible LLM server actor handles. """ self.config = config - self.server_manager = AsyncLLMServerManager(config, server_handles) + + if self.AsyncLLMServerManager == None: + self.AsyncLLMServerManager = AsyncLLMServerManager + + self.server_manager = self.AsyncLLMServerManager(config, server_handles) self.rm_executor = rm_executor model_path = config.actor_rollout_ref.model.path @@ -455,19 +459,19 @@ async def generate_sequences(self, batch: DataProto) -> DataProto: return output async def _run_agent_loop( - self, - sampling_params: dict[str, Any], - trajectory: dict[str, Any], - *, - agent_name: str, - **kwargs, + self, + sampling_params: dict[str, Any], + trajectory: dict[str, Any], + *, + agent_name: str, + **kwargs, ) -> _InternalAgentLoopOutput: with rollout_trace_attr( - step=trajectory["step"], - sample_index=trajectory["sample_index"], - rollout_n=trajectory["rollout_n"], - validate=trajectory["validate"], - name="agent_loop", + step=trajectory["step"], + sample_index=trajectory["sample_index"], + rollout_n=trajectory["rollout_n"], + validate=trajectory["validate"], + name="agent_loop", ): assert agent_name in _agent_loop_registry, ( f"Agent loop {agent_name} not registered, registered agent loops: 
{_agent_loop_registry.keys()}" @@ -550,8 +554,8 @@ async def _run_agent_loop( # TODO: support other multi-modal inputs multi_modal_inputs = None if ( - self.processor is not None - and "Qwen2VLImageProcessor" in self.processor.image_processor.__class__.__name__ + self.processor is not None + and "Qwen2VLImageProcessor" in self.processor.image_processor.__class__.__name__ ): from verl.models.transformers.qwen2_vl import get_rope_index @@ -580,8 +584,8 @@ async def _run_agent_loop( else: position_ids = compute_position_id_with_mask(attention_mask) # (1, seq_len) enable_async_reward = ( - self.rm_executor is not None and self.config.reward_model.enable_resource_pool - ) or not self.config.reward_model.enable + self.rm_executor is not None and self.config.reward_model.enable_resource_pool + ) or not self.config.reward_model.enable if output.reward_score is None and enable_async_reward: batch = TensorDict( { @@ -751,6 +755,9 @@ def batch_fn(data_list: list[DataProto]) -> list[torch.Tensor]: if self.config.actor_rollout_ref.rollout.free_cache_engine: self.sleep() + # for recipe to change AgentLoopWorker + self.AgentLoopWorker = AgentLoopWorker + def _initialize_llm_servers(self): rollout_world_size = self.config.actor_rollout_ref.rollout.tensor_model_parallel_size world_size = ( @@ -783,7 +790,7 @@ def _init_agent_loop_workers(self): # Round-robin scheduling over the all nodes node_id = node_ids[i % len(node_ids)] self.agent_loop_workers.append( - AgentLoopWorker.options( + self.AgentLoopWorker.options( name=f"agent_loop_worker_{i}", scheduling_strategy=ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy( node_id=node_id, soft=True diff --git a/verl/trainer/main_ppo.py b/verl/trainer/main_ppo.py index 0a8e1f3d27b..e33d346e482 100644 --- a/verl/trainer/main_ppo.py +++ b/verl/trainer/main_ppo.py @@ -43,13 +43,14 @@ def main(config): # Define a function to run the PPO-like training process -def run_ppo(config) -> None: +def run_ppo(config, task_runner_class=None) 
-> None: """Initialize Ray cluster and run distributed PPO training process. Args: config: Training configuration object containing all necessary parameters for distributed PPO training including Ray initialization settings, model paths, and training hyperparameters. + task_runner_class: For recipe to change TaskRunner. """ # Check if Ray is not initialized if not ray.is_initialized(): @@ -65,6 +66,9 @@ def run_ppo(config) -> None: print(f"ray init kwargs: {ray_init_kwargs}") ray.init(**OmegaConf.to_container(ray_init_kwargs)) + if task_runner_class is None: + task_runner_class = TaskRunner + # Create a remote instance of the TaskRunner class, and # Execute the `run` method of the TaskRunner instance remotely and wait for it to complete if ( @@ -79,9 +83,9 @@ def run_ppo(config) -> None: nsight_options = OmegaConf.to_container( config.global_profiler.global_tool_config.nsys.controller_nsight_options ) - runner = TaskRunner.options(runtime_env={"nsight": nsight_options}).remote() + runner = task_runner_class.options(runtime_env={"nsight": nsight_options}).remote() else: - runner = TaskRunner.remote() + runner = task_runner_class.remote() ray.get(runner.run.remote(config)) # [Optional] get the path of the timeline trace file from the configuration, default to None diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py index bb945c0451f..d1c58a67e27 100644 --- a/verl/trainer/ppo/ray_trainer.py +++ b/verl/trainer/ppo/ray_trainer.py @@ -642,9 +642,9 @@ def init_workers(self): actor_rollout_cls = RayClassWithInitArgs( cls=self.role_worker_mapping[Role.ActorRollout], config=self.config.actor_rollout_ref, - role="actor_rollout", + role=str(Role.ActorRollout), ) - self.resource_pool_to_cls[resource_pool]["actor_rollout"] = actor_rollout_cls + self.resource_pool_to_cls[resource_pool][str(Role.ActorRollout)] = actor_rollout_cls else: raise NotImplementedError @@ -653,7 +653,7 @@ def init_workers(self): resource_pool = 
self.resource_pool_manager.get_resource_pool(Role.Critic) critic_cfg = omega_conf_to_dataclass(self.config.critic) critic_cls = RayClassWithInitArgs(cls=self.role_worker_mapping[Role.Critic], config=critic_cfg) - self.resource_pool_to_cls[resource_pool]["critic"] = critic_cls + self.resource_pool_to_cls[resource_pool][str(Role.Critic)] = critic_cls # create reference policy if needed if self.use_reference_policy: @@ -661,16 +661,16 @@ def init_workers(self): ref_policy_cls = RayClassWithInitArgs( self.role_worker_mapping[Role.RefPolicy], config=self.config.actor_rollout_ref, - role="ref", + role=str(Role.RefPolicy), ) - self.resource_pool_to_cls[resource_pool]["ref"] = ref_policy_cls + self.resource_pool_to_cls[resource_pool][str(Role.RefPolicy)] = ref_policy_cls # create a reward model if reward_fn is None if self.use_rm: # we create a RM here resource_pool = self.resource_pool_manager.get_resource_pool(Role.RewardModel) rm_cls = RayClassWithInitArgs(self.role_worker_mapping[Role.RewardModel], config=self.config.reward_model) - self.resource_pool_to_cls[resource_pool]["rm"] = rm_cls + self.resource_pool_to_cls[resource_pool][str(Role.RewardModel)] = rm_cls # initialize WorkerGroup # NOTE: if you want to use a different resource pool for each role, which can support different parallel size, @@ -705,20 +705,20 @@ def init_workers(self): all_wg.update(spawn_wg) if self.use_critic: - self.critic_wg = all_wg["critic"] + self.critic_wg = all_wg[str(Role.Critic)] self.critic_wg.init_model() if self.use_reference_policy and not self.ref_in_actor: - self.ref_policy_wg = all_wg["ref"] + self.ref_policy_wg = all_wg[str(Role.RefPolicy)] self.ref_policy_wg.init_model() self.rm_wg = None if self.use_rm: - self.rm_wg = all_wg["rm"] + self.rm_wg = all_wg[str(Role.RewardModel)] self.rm_wg.init_model() # we should create rollout at the end so that vllm can have a better estimation of kv cache memory - self.actor_rollout_wg = all_wg["actor_rollout"] + self.actor_rollout_wg = 
all_wg[str(Role.ActorRollout)] self.actor_rollout_wg.init_model() # create async rollout manager and request scheduler @@ -766,11 +766,13 @@ def _save_checkpoint(self): ) if self.use_critic: - critic_local_path = os.path.join(local_global_step_folder, "critic") + critic_local_path = os.path.join(local_global_step_folder, str(Role.Critic)) critic_remote_path = ( None if self.config.trainer.default_hdfs_dir is None - else os.path.join(self.config.trainer.default_hdfs_dir, f"global_step_{self.global_steps}", "critic") + else os.path.join( + self.config.trainer.default_hdfs_dir, f"global_step_{self.global_steps}", str(Role.Critic) + ) ) self.critic_wg.save_checkpoint( critic_local_path, critic_remote_path, self.global_steps, max_ckpt_to_keep=max_critic_ckpt_to_keep @@ -826,7 +828,7 @@ def _load_checkpoint(self): print(f"Resuming from {global_step_folder}") actor_path = os.path.join(global_step_folder, "actor") - critic_path = os.path.join(global_step_folder, "critic") + critic_path = os.path.join(global_step_folder, str(Role.Critic)) # load actor self.actor_rollout_wg.load_checkpoint( actor_path, del_local_after_load=self.config.trainer.del_local_ckpt_after_load @@ -1044,7 +1046,7 @@ def fit(self): if self.use_reference_policy: # compute reference log_prob - with marked_timer("ref", timing_raw, color="olive"): + with marked_timer(str(Role.RefPolicy), timing_raw, color="olive"): if not self.ref_in_actor: ref_log_prob = self.ref_policy_wg.compute_ref_log_prob(batch) else: diff --git a/verl/trainer/ppo/utils.py b/verl/trainer/ppo/utils.py index 22d00a45052..31e886fd6f2 100644 --- a/verl/trainer/ppo/utils.py +++ b/verl/trainer/ppo/utils.py @@ -36,6 +36,37 @@ class Role(Enum): RewardModel = 5 ActorRolloutRef = 6 + def __str__(self): + return self._get_role_string() + + def _get_role_string(self): + role_mapping = { + Role.Actor: "actor", + Role.Rollout: "rollout", + Role.ActorRollout: "actor_rollout", + Role.Critic: "critic", + Role.RefPolicy: "ref", + Role.RewardModel: 
"rm", + Role.ActorRolloutRef: "actor_rollout_ref", + } + return role_mapping.get(self, self.name.lower()) + + @classmethod + def from_string(cls, name: str): + string_mapping = { + "actor": cls.Actor, + "rollout": cls.Rollout, + "actor_rollout": cls.ActorRollout, + "critic": cls.Critic, + "ref": cls.RefPolicy, + "rm": cls.RewardModel, + "actor_rollout_ref": cls.ActorRolloutRef, + } + role = string_mapping.get(name.lower()) + if role is None: + raise ValueError(f"No Role found for string: {name}") + return role + def need_reference_policy( role_worker_mapping: dict[Role, WorkerType], From 073e40f0848ddbf42ca6f469f6c81f7d323f29b5 Mon Sep 17 00:00:00 2001 From: wangshulin02 <953550366@qq.com> Date: Wed, 17 Sep 2025 10:49:53 +0800 Subject: [PATCH 141/182] cleaned up the fully_async metric, fix processing_time, add partial metric, add stale_trajectory_processed --- .../agent_loop/agent_loop.py | 18 +++++-- .../partial_single_turn_agent_loop.py | 17 ++++-- recipe/fully_async_policy/detach_utils.py | 45 ++++++++++------ .../fully_async_rollouter.py | 53 ++++++++++--------- .../fully_async_policy/fully_async_trainer.py | 43 ++++++++------- 5 files changed, 111 insertions(+), 65 deletions(-) diff --git a/recipe/fully_async_policy/agent_loop/agent_loop.py b/recipe/fully_async_policy/agent_loop/agent_loop.py index 4f4496c8999..b6433c0acd7 100644 --- a/recipe/fully_async_policy/agent_loop/agent_loop.py +++ b/recipe/fully_async_policy/agent_loop/agent_loop.py @@ -137,6 +137,10 @@ class AgentLoopOutput(BaseModel): """Indicates whether the request was interrupted""" log_probs: list[float] = None """Response token log probs including LLM generated token, tool response token.""" + param_version_start: int = 0 + """Indicate start parameter version when this response is generated""" + param_version_end: int = 0 + """Indicate end parameter version when this response is generated, used for partial rollout""" # make hydra.utils.instantiate happy @@ -381,7 +385,7 @@ async def 
generate_sequences(self, batch: DataProto) -> DataProto: return output async def generate_sequences_no_post( - self, batch: DataProto, partial_output_list: Optional[list[AgentLoopOutput]] + self, batch: DataProto, param_version: int, partial_output_list: Optional[list[AgentLoopOutput]] ) -> list[AgentLoopOutput]: """Generate sequences from agent loop. @@ -433,7 +437,9 @@ async def generate_sequences_no_post( ): tasks.append( asyncio.create_task( - self._run_agent_loop(agent_name, messages.tolist(), sampling_params, trajectory, partial_output) + self._run_agent_loop( + agent_name, messages.tolist(), sampling_params, trajectory, param_version, partial_output + ) ) ) outputs = await asyncio.gather(*tasks) @@ -446,6 +452,7 @@ async def _run_agent_loop( messages: list[dict[str, Any]], sampling_params: dict[str, Any], trajectory: dict[str, Any], + param_version: Optional[int] = None, partial_output: Optional[AgentLoopOutput] = None, ) -> AgentLoopOutput: with rollout_trace_attr( @@ -466,7 +473,7 @@ async def _run_agent_loop( tokenizer=self.tokenizer, ) if agent_name == "partial_single_turn_agent": - output = await agent_loop.run(messages, sampling_params, partial_output) + output = await agent_loop.run(messages, sampling_params, param_version, partial_output) else: output = await agent_loop.run(messages, sampling_params) return output @@ -602,6 +609,7 @@ def generate_sequences(self, prompts: DataProto) -> DataProto: async def generate_single_sample_async( self, sample: DataProto, + param_version: int, partial_output_list: Optional[list[AgentLoopOutput]], ) -> list[AgentLoopOutput]: """ @@ -617,7 +625,7 @@ async def generate_single_sample_async( # 使用负载均衡选择 worker worker = self._select_best_worker() # 异步处理单个样本 - 使用无后处理版本获取原始AgentLoopOutput - output_future = worker.generate_sequences_no_post.remote(sample, partial_output_list) + output_future = worker.generate_sequences_no_post.remote(sample, param_version, partial_output_list) return await 
asyncio.wrap_future(output_future.future()) def _select_best_worker(self): @@ -665,7 +673,7 @@ async def cancel_async(self): await asyncio.gather(*[asyncio.wrap_future(future.future()) for future in futures], return_exceptions=True) async def resume_async(self): - """Cancel all rollout tasks asynchronously.""" + """Resume all rollout tasks asynchronously.""" futures = [server.resume.remote() for server in self.async_llm_servers] await asyncio.gather(*[asyncio.wrap_future(future.future()) for future in futures], return_exceptions=True) diff --git a/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py b/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py index cf95c1eb965..c97f794bb9c 100644 --- a/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py +++ b/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py @@ -33,21 +33,30 @@ def __init__(self, *args, **kwargs): self.response_length = self.config.actor_rollout_ref.rollout.response_length async def run( - self, messages: list[dict[str, Any]], sampling_params: dict[str, Any], output: Optional[AgentLoopOutput] + self, + messages: list[dict[str, Any]], + sampling_params: dict[str, Any], + param_version: int, + output: Optional[AgentLoopOutput], ) -> AgentLoopOutput: + metrics = {} + param_version_start = None + param_version_end = None if not output: prompt_ids = await self.loop.run_in_executor( None, lambda: self.tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True) ) + param_version_start = param_version else: if output.is_cancel: # 恢复暂停的样本,结果直接添加到 prompt_ids 后面 prompt_ids = output.prompt_ids + output.response_ids + metrics["generate_sequences"] = output.metrics.generate_sequences + param_version_start = output.param_version_start else: # 同一批样本,部分cancel,部分没有cancel, 没有cancel的样本直接返回 return output - - metrics = {} + param_version_end = param_version request_id = uuid4().hex with simple_timer("generate_sequences", 
metrics): response_ids, log_probs, is_cancel = await self.server_manager.generate_for_partial( @@ -71,4 +80,6 @@ async def run( metrics=metrics, is_cancel=is_cancel, log_probs=log_probs, + param_version_start=param_version_start, + param_version_end=param_version_end, ) diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py index ad12ef69057..fe6fb8cdc69 100644 --- a/recipe/fully_async_policy/detach_utils.py +++ b/recipe/fully_async_policy/detach_utils.py @@ -46,6 +46,8 @@ class RolloutSample: # Processing metadata processing_times: list[float] param_version: int + param_version_start: list[int] + param_version_end: list[int] rollout_status: dict[str, Any] @@ -149,7 +151,8 @@ def merge_rollout_sample(config, tokenizer, rs: RolloutSample): rs.processing_times = [] for agent_loop in rs.agent_loop_output_list: rs.processing_times.append(agent_loop.metrics.generate_sequences) - + rs.param_version_start = [agent_loop.param_version_start for agent_loop in rs.agent_loop_output_list] + rs.param_version_end = [agent_loop.param_version_end for agent_loop in rs.agent_loop_output_list] # 第四步,清空 agent_loop_output_list rs.agent_loop_output_list = [] @@ -206,24 +209,34 @@ def assemble_batch_from_rollout_samples( # 收集统计信息和元数据(直接从 RolloutSample 中获取) param_versions = [rs.param_version for rs in rollout_samples] + trajectorys_param_versions = [version for rs in rollout_samples for version in rs.param_version_end] processing_time_stats = { - "avg_processing_time": np.mean(processing_times), - "max_processing_time": np.max(processing_times), - "min_processing_time": np.min(processing_times), - "tp50_processing_time": np.percentile(processing_times, 50), # 中位数 - "tp99_processing_time": np.percentile(processing_times, 99), # 99百分位 - "tp95_processing_time": np.percentile(processing_times, 95), # 95百分位也很有用 + "processing_time/avg": np.mean(processing_times), + "processing_time/max": np.max(processing_times), + "processing_time/min": 
np.min(processing_times), + "processing_time/tp50": np.percentile(processing_times, 50), # 中位数 + "processing_time/tp99": np.percentile(processing_times, 99), # 99百分位 + "processing_time/tp95": np.percentile(processing_times, 95), # 95百分位也很有用 } processing_time_stats = {f"fully_async/{key}": value for key, value in processing_time_stats.items()} + param_version_diff = [abs(a - b) for a, b in zip(rs.param_version_end, rs.param_version_start)] + num_diff0 = param_version_diff.count(0) + partial_stats = { + "fully_async/partial/total_partial_num": len(param_version_diff) - num_diff0, + "fully_async/partial/partial_ratio": (len(param_version_diff) - num_diff0) / len(param_version_diff), + "fully_async/partial/max_partial_span": max(param_version_diff), + } # 创建 meta_info final_batch.meta_info.update( { "rollout_param_versions": param_versions, "param_version_diversity": len(set(param_versions)) if param_versions else 0, + "trajectory_param_versions": trajectorys_param_versions, **processing_time_stats, **rollout_status, + **partial_stats, } ) @@ -255,6 +268,14 @@ def _init_aggregation_rules(self) -> dict[str, dict[str, list[str]]]: return { # Time-Based metrics, can add metrics here "time_sum": ["perf/time_per_step"], + "last": [ + "fully_async/count/total_generated_samples", + "fully_async/count/stale_samples_processed", + "fully_async/count/stale_trajectory_processed" + "fully_async/count/current_param_version", + "fully_async/count/dropped_stale_samples", + "training/global_step", # TODO 改为total_step + ], } def add_step_metrics(self, metrics: dict[str, Any], sample_count: int, timestamp: float = None): @@ -293,12 +314,6 @@ def _get_aggregation_type(self, metric_name: str) -> str: if any(keyword in metric_lower for keyword in ["weighted_avg"]): return "weighted_avg" - import warnings - - warnings.warn( - f"No aggregation rule is matched in init_aggregation_rules. 
\ - For metric {metric_name}, the 'avg' method is used" - ) return "avg" def _aggregate_single_metric(self, metric_name: str, values: list[float]) -> float: @@ -372,10 +387,10 @@ def _special_metrics_aggergate(self, aggregated: dict[str, Any]) -> dict[str, An aggregated["perf/throughput"] = aggregated["perf/total_num_tokens"] / ( aggregated["perf/time_per_step"] * self.total_gpus ) - + # trainer/idle_ratio if "timing_s/gen" in aggregated.keys() and "timing_s/step" in aggregated.keys(): - aggregated["trainer/idle_ratio"] = aggregated["timing_s/gen"] / aggregated["timing_s/step"] + aggregated["trainer/idle_ratio"] = aggregated["timing_s/gen"] / aggregated["timing_s/step"] return aggregated diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 8fbed0f0b65..1027e228c18 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -203,13 +203,13 @@ async def update_param_version(self, version: int, validate: bool = False, globa timing_raw = {} idle_ratio = None if self.idle_start_time is not None and self.version_start_time is not None: - rollout_active_time = self.idle_start_time - self.version_start_time - rollout_version_time = time.time() - self.version_start_time - idle_ratio = 1 - rollout_active_time / rollout_version_time - timing_raw["rollouter/active_time"] = rollout_active_time - timing_raw["rollouter/version_time"] = rollout_version_time - timing_raw["rollouter/idle_ratio"] = idle_ratio - self.idle_start_time = None + rollout_active_time = self.idle_start_time - self.version_start_time + rollout_version_time = time.time() - self.version_start_time + idle_ratio = 1 - rollout_active_time / rollout_version_time + timing_raw["rollouter/active_time"] = rollout_active_time + timing_raw["rollouter/version_time"] = rollout_version_time + timing_raw["rollouter/idle_ratio"] = idle_ratio + self.idle_start_time = None print( 
f"[FullyAsyncRollouter][Public][update_param_version] " f"Parameter version updated from {old_version} to {version} " @@ -293,6 +293,8 @@ async def _feed_samples(self): sample_id=sample_id, epoch=epoch, param_version=0, # 待处理后填充 + param_version_start=[], + param_version_end=[], processing_times=[], rollout_status={}, ) @@ -391,12 +393,10 @@ async def _process_single_sample_streaming(self, rollout_sample: RolloutSample): """流式处理单个样本""" # 调用异步生成方法 agent_loop_output_list = await self.async_rollout_manager.generate_single_sample_async( - rollout_sample.full_batch, rollout_sample.agent_loop_output_list + rollout_sample.full_batch, self.current_param_version, rollout_sample.agent_loop_output_list ) # 直接更新 RolloutSample 对象,填充剩余字段 rollout_sample.agent_loop_output_list = agent_loop_output_list - rollout_sample.param_version = self.current_param_version - rollout_sample.rollout_status = await self.get_statistics() is_cancel = False # 收集所有信息 @@ -418,6 +418,8 @@ async def _process_single_sample_streaming(self, rollout_sample: RolloutSample): await self.cancel_queue.put(rollout_sample) else: # 否则放入结果队列 + rollout_sample.param_version = self.current_param_version + rollout_sample.rollout_status = await self.get_statistics() await self.result_queue.put(rollout_sample) self.processed_sample_count += 1 @@ -617,20 +619,23 @@ async def get_statistics(self) -> dict: queue_stats = self.message_queue_client.get_statistics_sync() stats = { - "current_param_version": self.current_param_version, - "total_generated_samples": self.total_generated_samples, - "staleness_samples": self.staleness_samples, - "dropped_stale_samples": self.dropped_stale_samples, - "max_queue_size": self.max_queue_size, - "queue_size": queue_stats["queue_size"], - "max_concurrent_samples": self.max_concurrent_samples, - "pending_queue_size": self.pending_queue.qsize(), - "active_tasks_size": len(self.active_tasks), - "result_queue_size": self.result_queue.qsize(), - "max_required_samples": self.max_required_samples, 
- "required_samples": self.required_samples, - "staleness_threshold": self.staleness_threshold, - "cancel_queue_size": self.cancel_queue.qsize(), + # static stats + "static/max_required_samples": self.max_required_samples, + "static/required_samples": self.required_samples, + "static/staleness_threshold": self.staleness_threshold, + "static/max_queue_size": self.max_queue_size, + "static/max_concurrent_samples": self.max_concurrent_samples, + # counting stats + "count/current_param_version": self.current_param_version, + "count/total_generated_samples": self.total_generated_samples, + "count/staleness_samples": self.staleness_samples, + "count/dropped_stale_samples": self.dropped_stale_samples, + # monitor stats + "monitor/active_tasks_size": len(self.active_tasks), + "monitor/queue/pending_queue_size": self.pending_queue.qsize(), + "monitor/queue/cancel_queue_size": self.cancel_queue.qsize(), + "monitor/queue/result_queue_size": self.result_queue.qsize(), + "monitor/queue/mq_queue_size": queue_stats["queue_size"], } return stats diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 0f0c35d7db5..2f8bf2ddfa5 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -107,6 +107,7 @@ def __init__( self.local_trigger_step = 1 self.processed_samples = 0 self.stale_samples_processed = 0 + self.stale_trajectory_processed = 0 self.current_param_version = 0 self.total_train_steps = None self.progress_bar = None @@ -262,7 +263,7 @@ def fit(self): if val_data.metrics: self.logger.log(data=val_data.metrics, step=val_data.param_version) pprint(f"[FullyAsyncTrainer] Initial validation metrics: {val_data.metrics}") - self.logger.log(data=val_data.timing_raw, step=val_data.param_version) + self.logger.log(data=val_data.timing_raw, step=val_data.param_version) # Use queue mode, no need for traditional dataloader iterator # Initialize to get the first batch 
of data @@ -275,23 +276,7 @@ def fit(self): epoch, batch = self._get_samples_from_queue() if batch is None: break - - # 从meta_info中获取参数版本信息 - if hasattr(batch, "meta_info") and batch.meta_info: - # 统计陈旧样本 - rollout_param_versions = batch.meta_info["rollout_param_versions"] - stale_count = sum(1 for v in rollout_param_versions if self.current_param_version - v > 1) - self.stale_samples_processed += stale_count - metrics.update( - { - "fully_async/stale_samples_ratio": stale_count / len(rollout_param_versions), - "fully_async/stale_samples_processed": self.stale_samples_processed, - "fully_async/current_param_version": self.current_param_version, - } - ) - for key, value in batch.meta_info.items(): - if key.startswith("fully_async"): - metrics[key] = value + self._collect_metrics_from_samples(batch, metrics) batch, reward_extra_infos_dict = self._process_batch_common(batch, metrics, timing_raw) self._log_rollout(batch, reward_extra_infos_dict, timing_raw) @@ -342,6 +327,28 @@ def fit(self): def load_checkpoint(self): return self._load_checkpoint() + def _collect_metrics_from_samples(self, batch, metrics): + """ + Collect metrics from samples + """ + if hasattr(batch, "meta_info") and batch.meta_info: + samples_param_versions = batch.meta_info["rollout_param_versions"] + stale_count = sum(1 for v in samples_param_versions if self.current_param_version - v > 1) + self.stale_samples_processed += stale_count + trajectory_param_versions = batch.meta_info["trajectory_param_versions"] + stale_traj_count = sum(1 for v in trajectory_param_versions if self.current_param_version - v > 1) + self.stale_trajectory_processed += stale_traj_count + metrics.update( + { + "fully_async/count/stale_samples_processed": self.stale_samples_processed, + "fully_async/count/stale_trajectory_processed": self.stale_trajectory_processed, + "fully_async/count/current_param_version": self.current_param_version, + } + ) + for key, value in batch.meta_info.items(): + if key.startswith("fully_async"): 
+ metrics[key] = value + def _trigger_parameter_sync_after_step(self, validate: bool = False, global_steps: int = None): """ Trigger parameter synchronization after training step From f029e30967499bcf51899ccffbdf44738ed80b96 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Wed, 17 Sep 2025 10:55:49 +0800 Subject: [PATCH 142/182] refactor 2 --- .../fully_async_policy/agent_loop/__init__.py | 3 +++ .../agent_loop/agent_loop.py | 4 ++-- verl/experimental/agent_loop/agent_loop.py | 23 +++++++++++-------- 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/recipe/fully_async_policy/agent_loop/__init__.py b/recipe/fully_async_policy/agent_loop/__init__.py index 5f059078964..f1e1c647e51 100644 --- a/recipe/fully_async_policy/agent_loop/__init__.py +++ b/recipe/fully_async_policy/agent_loop/__init__.py @@ -14,3 +14,6 @@ from .partial_single_turn_agent_loop import PartialSingleTurnAgentLoop _ = [PartialSingleTurnAgentLoop] + + +from .agent_loop import FullyAgentLoopManager \ No newline at end of file diff --git a/recipe/fully_async_policy/agent_loop/agent_loop.py b/recipe/fully_async_policy/agent_loop/agent_loop.py index 38c461629dc..4527347994e 100644 --- a/recipe/fully_async_policy/agent_loop/agent_loop.py +++ b/recipe/fully_async_policy/agent_loop/agent_loop.py @@ -60,7 +60,7 @@ class PartialAgentLoopOutput(AgentLoopOutput): @ray.remote -class FullyAgentLoopWorker(AgentLoopWorker): +class FullyAgentLoopWorker(AgentLoopWorkerBase): def __init__( self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], rm_executor: BatchExecutor = None ): @@ -158,8 +158,8 @@ async def _partial_run_agent_loop( class FullyAgentLoopManager(AgentLoopManager): def __init__(self, config: DictConfig, worker_group: RayWorkerGroup = None, rm_wg: RayWorkerGroup = None): - super().__init__(config, worker_group, rm_wg) self.AgentLoopWorker = FullyAgentLoopWorker + super().__init__(config, worker_group, rm_wg) async def generate_single_sample_async( self, diff --git 
a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py index f520f89472a..9458c8f8123 100644 --- a/verl/experimental/agent_loop/agent_loop.py +++ b/verl/experimental/agent_loop/agent_loop.py @@ -84,12 +84,12 @@ def _choose_server(self, request_id: str) -> ray.actor.ActorHandle: @rollout_trace_op async def generate( - self, - request_id, - *, - prompt_ids: list[int], - sampling_params: dict[str, Any], - image_data: Optional[list[Any]] = None, + self, + request_id, + *, + prompt_ids: list[int], + sampling_params: dict[str, Any], + image_data: Optional[list[Any]] = None, ) -> TokenOutput: """Generate tokens from prompt ids. @@ -350,8 +350,7 @@ def compute_score( return {"reward_score": reward_score, "reward_extra_info": reward_extra_info} -@ray.remote -class AgentLoopWorker: +class AgentLoopWorkerBase: """Agent loop worker takes a batch of messages and run each message in an agent loop.""" def __init__( @@ -690,6 +689,12 @@ def _postprocess(self, inputs: list[_InternalAgentLoopOutput]) -> DataProto: ) +@ray.remote +class AgentLoopWorker(AgentLoopWorkerBase): + def __init__(self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], + rm_executor: BatchExecutor = None): + super().__init__(config, server_handles, rm_executor) + async def get_trajectory_info(step, index, validate): """Get trajectory info. 
@@ -854,7 +859,7 @@ def _performance_metrics(self, metrics: list[list[dict[str, str]]], output: Data return timing - def wake_up(self): + async def wake_up(self): """Wake up all rollout replica instances.""" self._run_all([replica.wake_up() for replica in self.rollout_replicas]) From 94d681dce4911c3ad6abb011a2d03c62d318868d Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Wed, 17 Sep 2025 11:03:52 +0800 Subject: [PATCH 143/182] qwen3-32b-64-64 --- .../exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh index 8427547d161..48be3ab3c84 100644 --- a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh +++ b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh @@ -2,7 +2,7 @@ set -xeuo pipefail project_name='DAPO' -exp_name='dapo_qwen2-32B_20k_fsdp2_fully-async_64-64-tps1' +exp_name='dapo_qwen2-32B_20k_fsdp2_fully-async_64-64' # Paths MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/Qwen2.5-32B @@ -62,11 +62,11 @@ NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} train_prompt_bsz=0 gen_prompt_bsz=1 n_resp_per_prompt=16 -train_prompt_mini_bsz=128 -total_rollout_steps=$(((512*400))) +train_prompt_mini_bsz=16 +total_rollout_steps=$(((512*200))) test_freq=20 staleness_threshold=0.1 -trigger_parameter_sync_step=1 +trigger_parameter_sync_step=8 partial_rollout=True python -m recipe.fully_async_policy.fully_async_main \ From a382f9af5b496b4778721f281bf85750a4788357 Mon Sep 17 00:00:00 2001 From: wangshulin02 <953550366@qq.com> Date: Wed, 17 Sep 2025 11:49:25 +0800 Subject: [PATCH 144/182] add param_sync time log --- recipe/fully_async_policy/fully_async_trainer.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/recipe/fully_async_policy/fully_async_trainer.py 
b/recipe/fully_async_policy/fully_async_trainer.py index 2f8bf2ddfa5..3a81fcc1892 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -366,9 +366,13 @@ def _trigger_parameter_sync_after_step(self, validate: bool = False, global_step ) self.progress_bar.update(1) self.metrics_aggregator.reset() - ray.get(self.param_synchronizer.wait_last_valid.remote()) - ray.get( - self.param_synchronizer.sync_weights.remote( - self.current_param_version, validate=validate, global_steps=global_steps + timing_param_sync = {} + with marked_timer("timing_s/wait_last_valid", timing_param_sync): + ray.get(self.param_synchronizer.wait_last_valid.remote()) + with marked_timer("timing_s/param_sync", timing_param_sync): + ray.get( + self.param_synchronizer.sync_weights.remote( + self.current_param_version, validate=validate, global_steps=global_steps + ) ) - ) + self.logger.log(data=timing_param_sync, step=self.current_param_version) \ No newline at end of file From e6d51d32c20ee902abe4713e632baf17847e371d Mon Sep 17 00:00:00 2001 From: wangshulin02 <953550366@qq.com> Date: Wed, 17 Sep 2025 11:52:09 +0800 Subject: [PATCH 145/182] fix typo --- recipe/fully_async_policy/agent_loop/agent_loop.py | 1 + recipe/fully_async_policy/detach_utils.py | 2 +- recipe/fully_async_policy/fully_async_trainer.py | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/recipe/fully_async_policy/agent_loop/agent_loop.py b/recipe/fully_async_policy/agent_loop/agent_loop.py index 72d1b29c1c0..2e61b0fc725 100644 --- a/recipe/fully_async_policy/agent_loop/agent_loop.py +++ b/recipe/fully_async_policy/agent_loop/agent_loop.py @@ -701,6 +701,7 @@ def async_server_class( if rollout_backend == "vllm": from recipe.fully_async_policy.vllm_rollout.vllm_async_server import AsyncvLLMServer + return AsyncvLLMServer else: raise NotImplementedError(f"rollout backend {rollout_backend} is not supported") diff --git 
a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py index fe6fb8cdc69..133d8178ee7 100644 --- a/recipe/fully_async_policy/detach_utils.py +++ b/recipe/fully_async_policy/detach_utils.py @@ -271,7 +271,7 @@ def _init_aggregation_rules(self) -> dict[str, dict[str, list[str]]]: "last": [ "fully_async/count/total_generated_samples", "fully_async/count/stale_samples_processed", - "fully_async/count/stale_trajectory_processed" + "fully_async/count/stale_trajectory_processed", "fully_async/count/current_param_version", "fully_async/count/dropped_stale_samples", "training/global_step", # TODO 改为total_step diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 3a81fcc1892..f014993f13e 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -375,4 +375,4 @@ def _trigger_parameter_sync_after_step(self, validate: bool = False, global_step self.current_param_version, validate=validate, global_steps=global_steps ) ) - self.logger.log(data=timing_param_sync, step=self.current_param_version) \ No newline at end of file + self.logger.log(data=timing_param_sync, step=self.current_param_version) From d759cfe92e4329ce160c7efcbe0bff0ee0e5cca4 Mon Sep 17 00:00:00 2001 From: wangshulin02 <953550366@qq.com> Date: Wed, 17 Sep 2025 13:10:07 +0800 Subject: [PATCH 146/182] fix typo --- recipe/fully_async_policy/agent_loop/agent_loop.py | 4 +--- recipe/fully_async_policy/detach_utils.py | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/recipe/fully_async_policy/agent_loop/agent_loop.py b/recipe/fully_async_policy/agent_loop/agent_loop.py index 2e61b0fc725..901e584f7c3 100644 --- a/recipe/fully_async_policy/agent_loop/agent_loop.py +++ b/recipe/fully_async_policy/agent_loop/agent_loop.py @@ -34,6 +34,7 @@ from verl.utils import hf_tokenizer from verl.utils.fs import copy_to_local from verl.utils.rollout_trace 
import RolloutTraceConfig, rollout_trace_attr, rollout_trace_op +from verl.workers.rollout.async_server import AsyncServerBase logger = logging.getLogger(__file__) logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) @@ -678,9 +679,6 @@ async def resume_async(self): await asyncio.gather(*[asyncio.wrap_future(future.future()) for future in futures], return_exceptions=True) -from verl.workers.rollout.async_server import AsyncServerBase - - def async_server_class( rollout_backend: str, rollout_backend_module: Optional[str] = None, rollout_backend_class: Optional[str] = None ) -> type[AsyncServerBase]: diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py index 133d8178ee7..32a36bb882a 100644 --- a/recipe/fully_async_policy/detach_utils.py +++ b/recipe/fully_async_policy/detach_utils.py @@ -289,7 +289,7 @@ def add_step_metrics(self, metrics: dict[str, Any], sample_count: int, timestamp # Store all metrics values for key, value in metrics.items(): - if isinstance(value, (int, float, np.number)): + if isinstance(value, int | float | np.number): self.metric_values[key].append(float(value)) elif isinstance(value, torch.Tensor): self.metric_values[key].append(float(value.item())) From c8db507eb62054974c40a5691078e55147ac7055 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Wed, 17 Sep 2025 15:40:14 +0800 Subject: [PATCH 147/182] refactor 3 --- .../fully_async_policy/agent_loop/__init__.py | 5 +- .../agent_loop/agent_loop.py | 84 ++-- .../partial_single_turn_agent_loop.py | 27 +- .../agent_loop/single_turn_agent_loop.py | 55 --- recipe/fully_async_policy/detach_utils.py | 4 +- .../fully_async_rollouter.py | 4 +- .../fully_async_policy/fully_async_trainer.py | 2 - recipe/fully_async_policy/ray_trainer.py | 4 +- .../vllm_rollout/__init__.py | 13 + .../vllm_rollout/vllm_async_server.py | 364 +++--------------- verl/experimental/agent_loop/agent_loop.py | 70 ++-- .../rollout/vllm_rollout/vllm_async_server.py | 6 +- 12 files changed, 
170 insertions(+), 468 deletions(-) delete mode 100644 recipe/fully_async_policy/agent_loop/single_turn_agent_loop.py diff --git a/recipe/fully_async_policy/agent_loop/__init__.py b/recipe/fully_async_policy/agent_loop/__init__.py index f1e1c647e51..40dcd0ac7a3 100644 --- a/recipe/fully_async_policy/agent_loop/__init__.py +++ b/recipe/fully_async_policy/agent_loop/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2024 Bytedance Ltd. and/or its affiliates +# Copyright 2025 Meituan Ltd. and/or its affiliates # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,7 +13,8 @@ # limitations under the License. from .partial_single_turn_agent_loop import PartialSingleTurnAgentLoop + _ = [PartialSingleTurnAgentLoop] -from .agent_loop import FullyAgentLoopManager \ No newline at end of file +from .agent_loop import PartialAgentLoopManager diff --git a/recipe/fully_async_policy/agent_loop/agent_loop.py b/recipe/fully_async_policy/agent_loop/agent_loop.py index 4527347994e..72f7c0afc2e 100644 --- a/recipe/fully_async_policy/agent_loop/agent_loop.py +++ b/recipe/fully_async_policy/agent_loop/agent_loop.py @@ -1,4 +1,4 @@ -# Copyright 2024 Bytedance Ltd. and/or its affiliates +# Copyright 2025 Meituan Ltd. and/or its affiliates # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -14,23 +14,9 @@ import asyncio import logging import os -from typing import Any, Optional - -import hydra -import numpy as np -import ray -import torch -from omegaconf import DictConfig, OmegaConf -from tensordict import TensorDict - -from verl.protocol import DataProto -from verl.single_controller.ray.base import RayWorkerGroup -from verl.utils import hf_tokenizer -from verl.utils.fs import copy_to_local -from verl.utils.rollout_trace import RolloutTraceConfig, rollout_trace_attr -from verl.workers.rollout.replica import TokenOutput from verl.experimental.agent_loop.agent_loop import AgentLoopOutput, _agent_loop_registry, _DummyConfig +from verl.protocol import DataProto logger = logging.getLogger(__file__) logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) @@ -60,15 +46,15 @@ class PartialAgentLoopOutput(AgentLoopOutput): @ray.remote -class FullyAgentLoopWorker(AgentLoopWorkerBase): +class PartialAgentLoopWorker(AgentLoopWorkerBase): def __init__( - self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], rm_executor: BatchExecutor = None + self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], rm_executor: BatchExecutor = None ): self.AsyncLLMServerManager = PartialAsyncLLMServerManager super().__init__(config, server_handles, rm_executor) async def generate_sequences_no_post( - self, batch: DataProto, partial_output_list: Optional[list[AgentLoopOutput]] + self, batch: DataProto, partial_output_list: Optional[list[AgentLoopOutput]] ) -> list[AgentLoopOutput]: """Generate sequences from agent loop. 
@@ -105,7 +91,7 @@ async def generate_sequences_no_post( if "index" in batch.non_tensor_batch: index = batch.non_tensor_batch["index"] else: - index = np.arange(len(raw_prompts)) + index = np.arange(len(batch)) trajectory_info = await get_trajectory_info( batch.meta_info.get("global_steps", -1), index, batch.meta_info.get("validate", False) @@ -117,29 +103,26 @@ async def generate_sequences_no_post( tasks = [] for i in range(len(batch)): kwargs = {k: v[i] for k, v in batch.non_tensor_batch.items()} + kwargs["output"] = partial_output_list[i] tasks.append( - asyncio.create_task( - self._partial_run_agent_loop(sampling_params, - trajectory_info[i], - partial_output_list[i], - **kwargs))) + asyncio.create_task(self._partial_run_agent_loop(sampling_params, trajectory_info[i], **kwargs)) + ) return await asyncio.gather(*tasks) async def _partial_run_agent_loop( - self, - sampling_params: dict[str, Any], - trajectory: dict[str, Any], - partial_output: Optional[AgentLoopOutput] = None, - *, - agent_name: str, - **kwargs, + self, + sampling_params: dict[str, Any], + trajectory: dict[str, Any], + *, + agent_name: str, + **kwargs, ) -> AgentLoopOutput: with rollout_trace_attr( - step=trajectory["step"], - sample_index=trajectory["sample_index"], - rollout_n=trajectory["rollout_n"], - validate=trajectory["validate"], - name="agent_loop", + step=trajectory["step"], + sample_index=trajectory["sample_index"], + rollout_n=trajectory["rollout_n"], + validate=trajectory["validate"], + name="agent_loop", ): assert agent_name in _agent_loop_registry, ( f"Agent loop {agent_name} not registered, registered agent loops: {_agent_loop_registry.keys()}" @@ -153,18 +136,18 @@ async def _partial_run_agent_loop( tokenizer=self.tokenizer, processor=self.processor, ) - return await agent_loop.run(sampling_params, partial_output, **kwargs) + return await agent_loop.run(sampling_params, **kwargs) -class FullyAgentLoopManager(AgentLoopManager): +class PartialAgentLoopManager(AgentLoopManager): def 
__init__(self, config: DictConfig, worker_group: RayWorkerGroup = None, rm_wg: RayWorkerGroup = None): - self.AgentLoopWorker = FullyAgentLoopWorker + self.AgentLoopWorker = PartialAgentLoopWorker super().__init__(config, worker_group, rm_wg) async def generate_single_sample_async( - self, - sample: DataProto, - partial_output_list: Optional[list[AgentLoopOutput]], + self, + sample: DataProto, + partial_output_list: Optional[list[AgentLoopOutput]], ) -> list[AgentLoopOutput]: """ 异步处理单个样本, 需要复制n次 @@ -191,12 +174,23 @@ def _select_best_worker(self): self._worker_index = (self._worker_index + 1) % len(self.agent_loop_workers) return worker + async def sleep(self): + futures = [replica.sleep.remote() for replica in self.rollout_replicas] + await asyncio.gather(*[asyncio.wrap_future(future.future()) for future in futures], return_exceptions=True) + + async def wake_up(self): + futures = [replica.wake_up.remote() for replica in self.rollout_replicas] + await asyncio.gather(*[asyncio.wrap_future(future.future()) for future in futures], return_exceptions=True) + async def cancel_async(self): """Cancel all rollout tasks asynchronously.""" - futures = [server.cancel.remote() for server in self.async_llm_servers] + futures = [replica.cancel.remote() for replica in self.rollout_replicas] await asyncio.gather(*[asyncio.wrap_future(future.future()) for future in futures], return_exceptions=True) async def resume_async(self): """Cancel all rollout tasks asynchronously.""" - futures = [server.resume.remote() for server in self.async_llm_servers] + futures = [replica.resume.remote() for replica in self.rollout_replicas] await asyncio.gather(*[asyncio.wrap_future(future.future()) for future in futures], return_exceptions=True) + + def _run_all(self, tasks: list[asyncio.Task]): + pass diff --git a/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py b/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py index cf95c1eb965..5e512093bfe 100644 --- 
a/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py +++ b/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py @@ -1,4 +1,4 @@ -# Copyright 2024 Bytedance Ltd. and/or its affiliates +# Copyright 2025 Meituan Ltd. and/or its affiliates # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,7 +16,12 @@ from typing import Any, Optional from uuid import uuid4 -from recipe.fully_async_policy.agent_loop.agent_loop import AgentLoopBase, AgentLoopOutput, register +from recipe.fully_async_policy.agent_loop.agent_loop import ( + AgentLoopBase, + AgentLoopOutput, + PartialAgentLoopOutput, + register, +) from verl.utils.profiler import simple_timer logger = logging.getLogger(__file__) @@ -31,13 +36,21 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.prompt_length = self.config.actor_rollout_ref.rollout.prompt_length self.response_length = self.config.actor_rollout_ref.rollout.response_length + self.apply_chat_template_kwargs = self.config.data.get("apply_chat_template_kwargs", {}) + + async def run(self, sampling_params: dict[str, Any], **kwargs) -> AgentLoopOutput: + output: Optional[PartialAgentLoopOutput] = kwargs.get("output", None) + messages = list(kwargs["raw_prompt"]) + + metrics = {} + request_id = uuid4().hex - async def run( - self, messages: list[dict[str, Any]], sampling_params: dict[str, Any], output: Optional[AgentLoopOutput] - ) -> AgentLoopOutput: if not output: prompt_ids = await self.loop.run_in_executor( - None, lambda: self.tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True) + None, + lambda: self.tokenizer.apply_chat_template( + messages, add_generation_prompt=True, tokenize=True, **self.apply_chat_template_kwargs + ), ) else: if output.is_cancel: @@ -63,7 +76,7 @@ async def run( response_ids = output.response_ids + response_ids response_mask = [1] * len(response_ids) - 
return AgentLoopOutput( + return PartialAgentLoopOutput( prompt_ids=prompt_ids, response_ids=response_ids[: self.response_length], response_mask=response_mask[: self.response_length], diff --git a/recipe/fully_async_policy/agent_loop/single_turn_agent_loop.py b/recipe/fully_async_policy/agent_loop/single_turn_agent_loop.py deleted file mode 100644 index 6dcdf327b09..00000000000 --- a/recipe/fully_async_policy/agent_loop/single_turn_agent_loop.py +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright 2024 Bytedance Ltd. and/or its affiliates -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import logging -import os -from typing import Any -from uuid import uuid4 - -from recipe.fully_async_policy.agent_loop.agent_loop import AgentLoopBase, AgentLoopOutput, register -from verl.utils.profiler import simple_timer - -logger = logging.getLogger(__file__) -logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) - - -@register("single_turn_agent") -class SingleTurnAgentLoop(AgentLoopBase): - """Naive agent loop that only do single turn chat completion.""" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.prompt_length = self.config.actor_rollout_ref.rollout.prompt_length - self.response_length = self.config.actor_rollout_ref.rollout.response_length - - async def run(self, messages: list[dict[str, Any]], sampling_params: dict[str, Any]) -> AgentLoopOutput: - metrics = {} - request_id = uuid4().hex - prompt_ids = await self.loop.run_in_executor( - None, lambda: self.tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True) - ) - - with simple_timer("generate_sequences", metrics): - response_ids = await self.server_manager.generate( - request_id=request_id, prompt_ids=prompt_ids, sampling_params=sampling_params - ) - response_mask = [1] * len(response_ids) - - output = AgentLoopOutput( - prompt_ids=prompt_ids, - response_ids=response_ids[: self.response_length], - response_mask=response_mask[: self.response_length], - num_turns=2, - metrics=metrics, - ) - return output diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py index 69041d923b5..450b67b9ff9 100644 --- a/recipe/fully_async_policy/detach_utils.py +++ b/recipe/fully_async_policy/detach_utils.py @@ -228,7 +228,7 @@ def merge_rollout_sample(config, tokenizer, rs: RolloutSample): def assemble_batch_from_rollout_samples( - rollout_samples: list[RolloutSample], tokenizer, config, balance_batch=None + rollout_samples: list[RolloutSample], tokenizer, config, balance_batch=None ) -> DataProto: """ Assemble 
gen_batch_output from RolloutSample objects @@ -439,7 +439,7 @@ def _special_metrics_aggergate(self, aggregated: dict[str, Any]) -> dict[str, An REQUIRED_PERF_KEYS = {"perf/throughput", "perf/total_num_tokens", "perf/time_per_step"} if REQUIRED_PERF_KEYS.issubset(aggregated): aggregated["perf/throughput"] = aggregated["perf/total_num_tokens"] / ( - aggregated["perf/time_per_step"] * self.total_gpus + aggregated["perf/time_per_step"] * self.total_gpus ) return aggregated diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index e53e6c43ef5..048f727ea0d 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -258,10 +258,10 @@ def _create_continuous_iterator(self): def _init_async_rollout_manager(self): # create async rollout manager and request scheduler assert self.config.actor_rollout_ref.rollout.mode == "async" - from recipe.fully_async_policy.agent_loop import FullyAgentLoopManager + from recipe.fully_async_policy.agent_loop import PartialAgentLoopManager self.async_rollout_mode = True - self.async_rollout_manager = FullyAgentLoopManager( + self.async_rollout_manager = PartialAgentLoopManager( config=self.config, worker_group=self.rollout_wg, ) diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 4cba527c857..5d945137ab2 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -93,8 +93,6 @@ def __init__( ) self.use_critic = False - self._validate_config() - self.message_queue_client = None self.param_synchronizer = None diff --git a/recipe/fully_async_policy/ray_trainer.py b/recipe/fully_async_policy/ray_trainer.py index 0a74c5ed386..33601621993 100644 --- a/recipe/fully_async_policy/ray_trainer.py +++ b/recipe/fully_async_policy/ray_trainer.py @@ -175,10 +175,10 @@ def _init_async_rollout_manager(self): 
# create async rollout manager and request scheduler self.async_rollout_mode = False if self.config.actor_rollout_ref.rollout.mode == "async": - from recipe.fully_async_policy.agent_loop.agent_loop import FullyAgentLoopManager + from recipe.fully_async_policy.agent_loop.agent_loop import PartialAgentLoopManager self.async_rollout_mode = True - self.async_rollout_manager = FullyAgentLoopManager( + self.async_rollout_manager = PartialAgentLoopManager( config=self.config, worker_group=self.actor_rollout_wg, ) diff --git a/recipe/fully_async_policy/vllm_rollout/__init__.py b/recipe/fully_async_policy/vllm_rollout/__init__.py index e69de29bb2d..9cd3ed5b8e9 100644 --- a/recipe/fully_async_policy/vllm_rollout/__init__.py +++ b/recipe/fully_async_policy/vllm_rollout/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 Meituan Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py b/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py index 4826ebaa1d0..19a70c8d44b 100644 --- a/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py +++ b/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py @@ -1,4 +1,4 @@ -# Copyright 2024 Bytedance Ltd. and/or its affiliates +# Copyright 2025 Meituan Ltd. and/or its affiliates # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -13,199 +13,39 @@ # limitations under the License. import asyncio import logging -import os -import pickle -from typing import Any, Callable, Optional, Sequence +from typing import Any, Optional, Sequence import ray -import zmq from omegaconf import DictConfig -from starlette.requests import Request -from starlette.responses import JSONResponse, StreamingResponse +from ray.actor import ActorHandle from vllm import SamplingParams -from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.entrypoints.logger import RequestLogger -from vllm.entrypoints.openai.protocol import ChatCompletionRequest, ChatCompletionResponse, ErrorResponse -from vllm.entrypoints.openai.serving_chat import OpenAIServingChat -from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels from vllm.inputs import TokensPrompt from vllm.outputs import RequestOutput -from vllm.v1.engine.async_llm import AsyncLLM -from vllm.v1.executor.abstract import Executor -from vllm.worker.worker_base import WorkerWrapperBase -from verl.utils.fs import copy_to_local -from verl.workers.rollout.async_server import AsyncServerBase +from verl.workers.rollout.replica import RolloutMode +from verl.workers.rollout.vllm_rollout.vllm_async_server import ( + _qwen2_5_vl_dedup_image_tokens, + vLLMHttpServer, + vLLMReplica, +) logger = logging.getLogger(__file__) - - -def _get_model_runner_workers(vllm_config, init_ray: bool = True): - assert vllm_config.instance_id is not None, "instance_id must be set for external ray actors." - - fields = vllm_config.instance_id.split(":") - assert len(fields) == 4, ( - f"instance_id: {vllm_config.instance_id} must be in the format of " - f":::." - ) - namespace, wg_prefix, vllm_dp_size, vllm_dp_rank = fields[0], fields[1], int(fields[2]), int(fields[3]) - - # Make sure subprocess in same namespace as parent actor. 
- # actor name format: {name_prefix}WorkerDict_{pg_idx}:{local_rank} - if init_ray: - ray.init(namespace=namespace) - actor_names = [ - actor_name for actor_name in ray.util.list_named_actors() if actor_name.startswith(f"{wg_prefix}WorkerDict") - ] - - vllm_tp_size = vllm_config.parallel_config.tensor_parallel_size - assert len(actor_names) == vllm_dp_size * vllm_tp_size, ( - f"instance_id: {vllm_config.instance_id} has {len(actor_names)} actors, but vllm_dp_size: " - f"{vllm_dp_size} * vllm_tp_size: {vllm_tp_size} = {vllm_dp_size * vllm_tp_size} is expected." - ) - - def get_pg_index_and_local_rank(actor_name) -> tuple[int, int]: - fields = actor_name.split(":") - assert len(fields) == 2, f"invalid actor name: {actor_name}" - pg_index, local_rank = int(fields[0].split("_")[-1]), int(fields[1]) - return pg_index, local_rank - - # sort actor names by pg_index and local_rank - actor_names = sorted(actor_names, key=get_pg_index_and_local_rank) - actor_names = actor_names[vllm_dp_rank * vllm_tp_size : (vllm_dp_rank + 1) * vllm_tp_size] - workers: list[WorkerWrapperBase] = [ray.get_actor(actor_name) for actor_name in actor_names] - print(f"instance_id: {vllm_config.instance_id} initializes with external actors: {actor_names}") - - return workers - - -class ExternalRayDistributedExecutor(Executor): - """An executor that engines are launched by external ray actors.""" - - uses_ray: bool = False - - def _init_executor(self) -> None: - self.workers = _get_model_runner_workers(vllm_config=self.vllm_config, init_ray=True) - - kwargs = dict( - vllm_config=self.vllm_config, - local_rank=None, - rank=None, - distributed_init_method="env://", - is_driver_worker=True, - ) - self.collective_rpc("init_worker", args=([kwargs],)) - self.collective_rpc("init_device") - self.collective_rpc("load_model") - print(f"instance_id: {self.vllm_config.instance_id} initializes finished.") - - def collective_rpc( - self, - method: str | Callable, - timeout: Optional[float] = None, - args: tuple = 
(), - kwargs: Optional[dict[str, Any]] = None, - ) -> list[Any]: - # TODO(wuxibin): support ray compiled graph - if isinstance(method, str): - sent_method = method - else: - sent_method = pickle.dumps(method) - del method - - # ~3ms overhead per schedule step due to SchedulerOutput/ModelRunnerOutput serialization/deserialization. - outputs = ray.get( - [worker.execute_method.remote(sent_method, *args, **(kwargs or {})) for worker in self.workers] - ) - return outputs - - def check_health(self): - return - - -class ExternalZeroMQDistributedExecutor(Executor): - """An executor that engines are launched by external ray actors.""" - - uses_ray: bool = False - - def _init_executor(self) -> None: - addresses = os.environ["VERL_VLLM_ZMQ_ADDRESSES"].split(",") - self.context = zmq.Context() - self.sockets = [] - for address in addresses: - socket = self.context.socket(zmq.REQ) - socket.connect(address) - self.sockets.append(socket) - - kwargs = dict( - vllm_config=self.vllm_config, - local_rank=None, - rank=None, - distributed_init_method="env://", - is_driver_worker=True, - ) - self.collective_rpc("init_worker", args=([kwargs],)) - self.collective_rpc("init_device") - self.collective_rpc("load_model") - - def collective_rpc( - self, - method: str | Callable, - timeout: Optional[float] = None, - args: tuple = (), - kwargs: Optional[dict[str, Any]] = None, - ) -> list[Any]: - if isinstance(method, str): - sent_method = method - else: - sent_method = pickle.dumps(method) - del method - - message = pickle.dumps((sent_method, args, kwargs or {})) - for socket in self.sockets: - socket.send(message, zmq.DONTWAIT) - - outputs = [] - for socket in self.sockets: - outputs.append(pickle.loads(socket.recv())) - return outputs - - def check_health(self): - return +logger.setLevel(logging.INFO) @ray.remote(num_cpus=1) -class AsyncvLLMServer(AsyncServerBase): - """ - AsyncvLLMServer is a wrapper for AsyncLLM, it uses ExternalRayDistributedExecutor to launch engines - in hybrid rollout 
workers, i.e AsyncActorRolloutRefWorker. - - AsyncvLLMServer works as follows: - 1. Start FastAPI server first. - 2. Initialize AsyncLLM with ExternalRayDistributedExecutor. - 3. AsyncLLM spawn EngineCore in subprocess. - 4. EngineCore initialize ExternalRayDistributedExecutor. - 5. ExternalRayDistributedExecutor lookup its corresponding actors by name. - 6. ExternalRayDistributedExecutor init executor: init_worker, init_device, load_model. - - For vLLM AsyncLLM design, see: https://github.com/vllm-project/vllm/pull/9826 - """ - - def __init__(self, config: DictConfig, vllm_dp_size: int, vllm_dp_rank: int, wg_prefix: str): - """ - Args: - config: DictConfig. - vllm_dp_size: int, vllm data parallel size. - vllm_dp_rank: int, vllm data parallel rank. - wg_prefix: str, worker group prefix, used to lookup actors. - """ - super().__init__() - - self.config = config.actor_rollout_ref - self.vllm_dp_size = vllm_dp_size - self.vllm_dp_rank = vllm_dp_rank - self.wg_prefix = wg_prefix - self.engine: AsyncLLM = None +class vLLMHttpServerForPartial(vLLMHttpServer): + def __init__( + self, + config: DictConfig, + rollout_mode: RolloutMode, + workers: list[ActorHandle], + replica_rank: int, + node_rank: int, + gpus_per_node: int, + nnodes: int, + ): + super().__init__(config, rollout_mode, workers, replica_rank, node_rank, gpus_per_node, nnodes) # for cancel LLMServer self.paused = False @@ -213,131 +53,21 @@ def __init__(self, config: DictConfig, vllm_dp_size: int, vllm_dp_rank: int, wg_ self.cancel_event: dict[str, asyncio.Event] = {} self.req_output: dict[str, Optional[RequestOutput]] = {} - async def init_engine(self): - """Init vLLM AsyncLLM engine.""" - config = self.config - model_path = config.model.path - model_name = "/".join(model_path.split("/")[-2:]) - local_path = copy_to_local(model_path) - trust_remote_code = config.model.get("trust_remote_code", False) - config = config.rollout - - tensor_parallel_size = config.get("tensor_model_parallel_size", 1) - 
max_num_batched_tokens = config.get("max_num_batched_tokens", 8192) - max_model_len = config.max_model_len if config.max_model_len else config.prompt_length + config.response_length - self.max_model_len = int(max_model_len) - - # Override default generation config from hugging face model config, - # user can still override them by passing kwargs in each request. - kwargs = dict( - n=1, - logprobs=0, - repetition_penalty=1.0, - max_new_tokens=config.response_length, - ) - for k in config.keys(): - if hasattr(SamplingParams(), str(k)): - kwargs[k] = config.get(k) - print(f"override_generation_config: {kwargs}") - - backend = os.environ.get("VERL_VLLM_DISTRIBUTED_BACKEND", "zeromq") - if backend == "zeromq": - distributed_executor_backend = ExternalZeroMQDistributedExecutor - elif backend == "ray": - distributed_executor_backend = ExternalRayDistributedExecutor - else: - distributed_executor_backend = None - - engine_args = AsyncEngineArgs( - model=local_path, - enable_sleep_mode=config.free_cache_engine, - override_generation_config=kwargs, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - dtype=config.dtype, - enforce_eager=config.enforce_eager, - gpu_memory_utilization=config.gpu_memory_utilization, - disable_custom_all_reduce=True, - skip_tokenizer_init=False, - max_model_len=self.max_model_len, - load_format="auto", - disable_log_stats=config.disable_log_stats, - max_num_batched_tokens=max_num_batched_tokens, - enable_chunked_prefill=config.enable_chunked_prefill, - enable_prefix_caching=True, - trust_remote_code=trust_remote_code, - seed=config.get("seed", 0), - ) - - # init async llm engine - vllm_config = self._create_engine_config(engine_args) - self.engine = AsyncLLM.from_vllm_config(vllm_config) - - # build serving chat - model_config = self.engine.model_config - BASE_MODEL_PATHS = [BaseModelPath(name=model_name, model_path=model_path)] - models = OpenAIServingModels(self.engine, model_config, 
BASE_MODEL_PATHS) - self.openai_serving_chat = OpenAIServingChat( - self.engine, - model_config, - models, - "assistant", - request_logger=RequestLogger(max_log_len=4096), - chat_template=None, - chat_template_content_format="auto", - enable_auto_tools=config.multi_turn.tool_config_path is not None, - tool_parser=config.multi_turn.format, # hermes, llama3_json, ... - ) - - def _create_engine_config(self, engine_args: AsyncEngineArgs): - vllm_config = engine_args.create_engine_config() - namespace = ray.get_runtime_context().namespace - vllm_config.instance_id = f"{namespace}:{self.wg_prefix}:{self.vllm_dp_size}:{self.vllm_dp_rank}" - - # VERL_VLLM_ZMQ_ADDRESSES - if engine_args.distributed_executor_backend == ExternalZeroMQDistributedExecutor: - workers = _get_model_runner_workers(vllm_config=vllm_config, init_ray=False) - zmq_addresses = ray.get([worker.get_zeromq_address.remote() for worker in workers]) - print(f"VERL_VLLM_ZMQ_ADDRESSES: {zmq_addresses}") - os.environ["VERL_VLLM_ZMQ_ADDRESSES"] = ",".join(zmq_addresses) - - return vllm_config - - async def chat_completion(self, raw_request: Request): - """OpenAI-compatible HTTP endpoint. 
- - API reference: https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html - """ - request_json = await raw_request.json() - request = ChatCompletionRequest(**request_json) - generator = await self.openai_serving_chat.create_chat_completion(request, raw_request) - - if isinstance(generator, ErrorResponse): - return JSONResponse(content=generator.model_dump(), status_code=generator.code) - if request.stream: - return StreamingResponse(content=generator, media_type="text/event-stream") - else: - assert isinstance(generator, ChatCompletionResponse) - return JSONResponse(content=generator.model_dump()) - - async def generate(self, prompt_ids: list[int], sampling_params: dict[str, Any], request_id: str) -> list[int]: - max_tokens = self.max_model_len - len(prompt_ids) + async def _generate_step( + self, + prompt_ids: list[int], + sampling_params: dict[str, Any], + request_id: str, + image_data: Optional[list[Any]] = None, + ): + max_tokens = self.config.max_model_len - len(prompt_ids) + sampling_params["logprobs"] = 1 + sampling_params.setdefault("repetition_penalty", self.config.get("repetition_penalty", 1.0)) sampling_params = SamplingParams(max_tokens=max_tokens, **sampling_params) - prompt = TokensPrompt(prompt_token_ids=prompt_ids) - generator = self.engine.generate(prompt=prompt, sampling_params=sampling_params, request_id=request_id) - - # Get final response - final_res: Optional[RequestOutput] = None - async for output in generator: - final_res = output - assert final_res is not None - - return final_res.outputs[0].token_ids - - async def _generate_step(self, prompt_ids: list[int], sampling_params: dict[str, Any], request_id: str): - max_tokens = self.max_model_len - len(prompt_ids) - sampling_params = SamplingParams(max_tokens=max_tokens, logprobs=1, **sampling_params) - prompt = TokensPrompt(prompt_token_ids=prompt_ids) + prompt_ids = _qwen2_5_vl_dedup_image_tokens(prompt_ids, self.model_config.processor) + prompt = TokensPrompt( + 
prompt_token_ids=prompt_ids, multi_modal_data={"image": image_data} if image_data else None + ) generator = self.engine.generate(prompt=prompt, sampling_params=sampling_params, request_id=request_id) # Get final response @@ -347,7 +77,11 @@ async def _generate_step(self, prompt_ids: list[int], sampling_params: dict[str, assert self.req_output[request_id] is not None async def generate_for_partial( - self, prompt_ids: list[int], sampling_params: dict[str, Any], request_id: str + self, + prompt_ids: list[int], + sampling_params: dict[str, Any], + request_id: str, + image_data: Optional[list[Any]] = None, ) -> tuple[list[Any], list[Any], bool] | tuple[Sequence[int], list[float], Any]: # 设置中断标志 async with self.lock: @@ -356,7 +90,9 @@ async def generate_for_partial( return [], [], True self.cancel_event[request_id] = asyncio.Event() cancel_handle = asyncio.create_task(self.cancel_event[request_id].wait()) - generation_handle = asyncio.create_task(self._generate_step(prompt_ids, sampling_params, request_id)) + generation_handle = asyncio.create_task( + self._generate_step(prompt_ids, sampling_params, request_id, image_data) + ) done, pend = await asyncio.wait([generation_handle, cancel_handle], return_when=asyncio.FIRST_COMPLETED) @@ -388,12 +124,8 @@ async def resume(self): async with self.lock: self.paused = False - async def wake_up(self): - if self.config.rollout.free_cache_engine: - await self.engine.wake_up() - async def sleep(self): - # TODO: https://github.com/vllm-project/vllm/issues/17103 - await self.engine.reset_prefix_cache() - if self.config.rollout.free_cache_engine: - await self.engine.sleep() +class vLLMReplicaForPartial(vLLMReplica): + def __init__(self, replica_rank: int, config: DictConfig, gpus_per_node: int = 8): + super().__init__(replica_rank, config, gpus_per_node) + self.server_class = vLLMHttpServerForPartail diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py index 9458c8f8123..8a32691e1a9 
100644 --- a/verl/experimental/agent_loop/agent_loop.py +++ b/verl/experimental/agent_loop/agent_loop.py @@ -84,12 +84,12 @@ def _choose_server(self, request_id: str) -> ray.actor.ActorHandle: @rollout_trace_op async def generate( - self, - request_id, - *, - prompt_ids: list[int], - sampling_params: dict[str, Any], - image_data: Optional[list[Any]] = None, + self, + request_id, + *, + prompt_ids: list[int], + sampling_params: dict[str, Any], + image_data: Optional[list[Any]] = None, ) -> TokenOutput: """Generate tokens from prompt ids. @@ -179,12 +179,12 @@ class AgentLoopBase(ABC): _class_initialized = False def __init__( - self, - trainer_config: _DummyConfig, - server_manager: AsyncLLMServerManager, - tokenizer: AutoTokenizer, - processor: AutoProcessor, - **kwargs, + self, + trainer_config: _DummyConfig, + server_manager: AsyncLLMServerManager, + tokenizer: AutoTokenizer, + processor: AutoProcessor, + **kwargs, ): """Initialize agent loop, each sample will have its own loop instance. @@ -329,8 +329,8 @@ def __init__(self, config: DictConfig, local_path: str, rm_executor: BatchExecut self.rm_executor = rm_executor def compute_score( - self, - data: DataProto, + self, + data: DataProto, ) -> dict: """Compute reward score for agent loop output. @@ -354,7 +354,7 @@ class AgentLoopWorkerBase: """Agent loop worker takes a batch of messages and run each message in an agent loop.""" def __init__( - self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], rm_executor: BatchExecutor = None + self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], rm_executor: BatchExecutor = None ): """Initialize agent loop manager. 
@@ -364,7 +364,7 @@ def __init__( """ self.config = config - if self.AsyncLLMServerManager == None: + if self.AsyncLLMServerManager is None: self.AsyncLLMServerManager = AsyncLLMServerManager self.server_manager = self.AsyncLLMServerManager(config, server_handles) @@ -458,19 +458,19 @@ async def generate_sequences(self, batch: DataProto) -> DataProto: return output async def _run_agent_loop( - self, - sampling_params: dict[str, Any], - trajectory: dict[str, Any], - *, - agent_name: str, - **kwargs, + self, + sampling_params: dict[str, Any], + trajectory: dict[str, Any], + *, + agent_name: str, + **kwargs, ) -> _InternalAgentLoopOutput: with rollout_trace_attr( - step=trajectory["step"], - sample_index=trajectory["sample_index"], - rollout_n=trajectory["rollout_n"], - validate=trajectory["validate"], - name="agent_loop", + step=trajectory["step"], + sample_index=trajectory["sample_index"], + rollout_n=trajectory["rollout_n"], + validate=trajectory["validate"], + name="agent_loop", ): assert agent_name in _agent_loop_registry, ( f"Agent loop {agent_name} not registered, registered agent loops: {_agent_loop_registry.keys()}" @@ -553,8 +553,8 @@ async def _run_agent_loop( # TODO: support other multi-modal inputs multi_modal_inputs = None if ( - self.processor is not None - and "Qwen2VLImageProcessor" in self.processor.image_processor.__class__.__name__ + self.processor is not None + and "Qwen2VLImageProcessor" in self.processor.image_processor.__class__.__name__ ): from verl.models.transformers.qwen2_vl import get_rope_index @@ -583,8 +583,8 @@ async def _run_agent_loop( else: position_ids = compute_position_id_with_mask(attention_mask) # (1, seq_len) enable_async_reward = ( - self.rm_executor is not None and self.config.reward_model.enable_resource_pool - ) or not self.config.reward_model.enable + self.rm_executor is not None and self.config.reward_model.enable_resource_pool + ) or not self.config.reward_model.enable if output.reward_score is None and 
enable_async_reward: batch = TensorDict( { @@ -691,10 +691,12 @@ def _postprocess(self, inputs: list[_InternalAgentLoopOutput]) -> DataProto: @ray.remote class AgentLoopWorker(AgentLoopWorkerBase): - def __init__(self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], - rm_executor: BatchExecutor = None): + def __init__( + self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], rm_executor: BatchExecutor = None + ): super().__init__(config, server_handles, rm_executor) + async def get_trajectory_info(step, index, validate): """Get trajectory info. @@ -859,7 +861,7 @@ def _performance_metrics(self, metrics: list[list[dict[str, str]]], output: Data return timing - async def wake_up(self): + def wake_up(self): """Wake up all rollout replica instances.""" self._run_all([replica.wake_up() for replica in self.rollout_replicas]) diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py index 02c02417744..c4feae92c3f 100644 --- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py +++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py @@ -328,6 +328,10 @@ async def sleep(self): class vLLMReplica(RolloutReplica): + def __init__(self, replica_rank: int, config: DictConfig, gpus_per_node: int = 8): + super().__init__(replica_rank, config, gpus_per_node) + self.server_class = vLLMHttpServer + def get_ray_class_with_init_args(self) -> RayClassWithInitArgs: """Get rollout worker actor class for colocated and standalone mode.""" worker_dict_cls = RayClassWithInitArgs( @@ -362,7 +366,7 @@ async def launch_servers(self): for node_rank in range(nnodes): workers = self.workers[node_rank * gpus_per_node : (node_rank + 1) * gpus_per_node] node_id = worker_node_ids[node_rank * gpus_per_node] - server = vLLMHttpServer.options( + server = self.server_class.options( scheduling_strategy=ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy( node_id=node_id, soft=False, From 
3898c5fc4666cf10665d3cf9289a7355abcaf8f3 Mon Sep 17 00:00:00 2001 From: wangshulin02 <953550366@qq.com> Date: Wed, 17 Sep 2025 15:52:52 +0800 Subject: [PATCH 148/182] translate --- .../agent_loop/agent_loop.py | 13 +- .../partial_single_turn_agent_loop.py | 10 +- recipe/fully_async_policy/detach_utils.py | 74 ++++----- recipe/fully_async_policy/fsdp_workers.py | 2 +- .../fully_async_rollouter.py | 148 ++++++++---------- .../fully_async_policy/fully_async_trainer.py | 5 +- recipe/fully_async_policy/ray_trainer.py | 5 +- .../vllm_rollout/vllm_async_server.py | 6 +- 8 files changed, 124 insertions(+), 139 deletions(-) diff --git a/recipe/fully_async_policy/agent_loop/agent_loop.py b/recipe/fully_async_policy/agent_loop/agent_loop.py index 901e584f7c3..db29229915d 100644 --- a/recipe/fully_async_policy/agent_loop/agent_loop.py +++ b/recipe/fully_async_policy/agent_loop/agent_loop.py @@ -614,23 +614,24 @@ async def generate_single_sample_async( partial_output_list: Optional[list[AgentLoopOutput]], ) -> list[AgentLoopOutput]: """ - 异步处理单个样本, 需要复制n次 + Asynchronously process a single sample Args: - sample: 单个样本数据 + sample: Single sample data partial_output_list: Optional[List[AgentLoopOutput]]: already rollout result. 
Returns: - tuple[AgentLoopOutput, float]: 处理结果和处理时间 + list[AgentLoopOutput]: Processing results """ - # 使用负载均衡选择 worker + # select a worker worker = self._select_best_worker() - # 异步处理单个样本 - 使用无后处理版本获取原始AgentLoopOutput + # Process a single sample asynchronously, + # get the raw AgentLoopOutput using the no post-processing version output_future = worker.generate_sequences_no_post.remote(sample, param_version, partial_output_list) return await asyncio.wrap_future(output_future.future()) def _select_best_worker(self): - """选择最佳的 worker(简单的轮询负载均衡)""" + """Select the best worker, simple round-robin load balancing""" if not hasattr(self, "_worker_index"): self._worker_index = 0 diff --git a/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py b/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py index c97f794bb9c..25964406753 100644 --- a/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py +++ b/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py @@ -49,12 +49,16 @@ async def run( param_version_start = param_version else: if output.is_cancel: - # 恢复暂停的样本,结果直接添加到 prompt_ids 后面 + # Resume the paused sample, + # add the result directly after prompt_ids, + # and reset generate_sequences metric prompt_ids = output.prompt_ids + output.response_ids metrics["generate_sequences"] = output.metrics.generate_sequences param_version_start = output.param_version_start else: - # 同一批样本,部分cancel,部分没有cancel, 没有cancel的样本直接返回 + # In the same batch of samples, + # ome are canceled and some are not. + # The samples without partial rollout are returned directly. 
return output param_version_end = param_version request_id = uuid4().hex @@ -65,8 +69,8 @@ async def run( if not output: response_mask = [1] * len(response_ids) - # 暂停待恢复样本, 把输出结果加到 response_ids 后,并重置 response_mask else: + # Pause the sample to be resumed, add the output result to response_ids, and reset response_mask prompt_ids = output.prompt_ids log_probs = output.log_probs + log_probs response_ids = output.response_ids + response_ids diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py index 32a36bb882a..35128a4ade1 100644 --- a/recipe/fully_async_policy/detach_utils.py +++ b/recipe/fully_async_policy/detach_utils.py @@ -53,6 +53,8 @@ class RolloutSample: @dataclass class ValidateMetrics: + """Metrics for validation""" + timing_raw: dict[str, Any] metrics: Optional[dict[str, Any]] = None global_steps: Optional[int] = None @@ -61,23 +63,15 @@ class ValidateMetrics: def prepare_single_generation_data(batch_dict, global_steps, rollout_n) -> DataProto: """ - 类似 ray_trainer._prepare_generate_batch 的逻辑,但针对单个样本 - 分离出用于生成的数据和需要保留的原始数据 + Similar to the logic of ray_trainer._prepare_generate_batch, but for a single sample. + Separate the data used for generation from the original data. 
Returns: tuple: (original_batch_dict, gen_data_for_single_sample) """ - # 创建完整的 DataProto full_batch = DataProto.from_single_dict(batch_dict) - # batch : TensorDict { input_ids, attention_mask, position_ids} - # non_tensor_batch: raw_prompt_ids, raw_prompt, - # multi_modal_data, tools_kwargs, interaction_kwargs, index, agent_name, - # data_source, ability, reward_model - # meta_info: {} - - # 定义需要传递给生成服务器的字段 batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"] non_tensor_batch_keys_to_pop = ["raw_prompt_ids"] @@ -86,10 +80,10 @@ def prepare_single_generation_data(batch_dict, global_steps, rollout_n) -> DataP non_tensor_batch_keys=non_tensor_batch_keys_to_pop, ) - # 设置使用支持partial的agent + # Setting agent - partial_single_turn_agent, that supports partial full_batch.non_tensor_batch["agent_name"] = np.array(["partial_single_turn_agent"] * len(full_batch), dtype=object) - # 添加全局步数到生成数据 + # Add global step count to generated data full_batch.meta_info["global_steps"] = global_steps full_batch = full_batch.repeat(repeat_times=rollout_n, interleave=True) return full_batch @@ -97,32 +91,29 @@ def prepare_single_generation_data(batch_dict, global_steps, rollout_n) -> DataP def process_rollout_log_probs(data_proto: DataProto, rollout_log_probs: list[list[float]]) -> torch.Tensor: """ - 根据 DataProto 中的 mask 逻辑处理 rollout_log_probs - # attention_mask: [0,0,0,0,1,1,1,1, | 1,1,1,0,0,0,0,0] + Process rollout_log_probs according to the mask in DataProto + mask: [0,0,0,0,1,1,1,1, | 1,1,1,0,0,0,0,0] Args: - data_proto: 包含 batch 信息的 DataProto 对象 - rollout_log_probs: 二维列表,每个子列表包含一个样本的 log_probs + data_proto: A DataProto object containing batch information + rollout_log_probs: A two-dimensional list, each sublist containing the log_probs of a sample Returns: - torch.Tensor: 处理后的 log_probs tensor,形状为 [bsz, response_length] + torch.Tensor: The processed log_probs tensor, with shape: [bsz, response_length] """ batch = data_proto.batch response_mask = batch["response_mask"] - 
bsz, response_length = response_mask.shape - - # 初始化结果 tensor - rollout_log_probs_tensor = torch.zeros((bsz, response_length), dtype=torch.float32) - 1 + rollout_log_probs_tensor = torch.zeros(response_mask.shape, dtype=torch.float32) - 1 for i, log_probs_seq in enumerate(rollout_log_probs): - # 获取当前样本的有效长度(mask 中为 1 的位置数量) + # Get the effective length of the current sample (the number of positions with 1 in the mask) valid_length = response_mask[i].sum().item() - # 确保 log_probs_seq 的长度不超过有效长度 + # Ensure that the length of log_probs_seq does not exceed the valid length actual_length = min(len(log_probs_seq), valid_length) - # 将 log_probs 填入对应位置 + # Fill log_probs into the corresponding position if actual_length > 0: rollout_log_probs_tensor[i, :actual_length] = torch.tensor(log_probs_seq[:actual_length]) @@ -131,29 +122,32 @@ def process_rollout_log_probs(data_proto: DataProto, rollout_log_probs: list[lis def merge_rollout_sample(config, tokenizer, rs: RolloutSample): - # 第一步:从 AgentLoopOutput 创建生成结果的 DataProto + """ + Supplement and refine the RolloutSample object, + """ + # Step 1: Create a DataProto from the AgentLoopOutput to generate the result gen_batch_output = postprocess_agent_loop_outputs(rs.agent_loop_output_list, tokenizer, config) rollout_log_probs = [x.log_probs for x in rs.agent_loop_output_list] rollout_log_probs = process_rollout_log_probs(gen_batch_output, rollout_log_probs) gen_batch_output.batch["rollout_log_probs"] = rollout_log_probs.to(torch.float32) - # 第二步:添加 uid + # Step 2: Add uid rs.full_batch.non_tensor_batch["uid"] = np.array([f"uid_{rs.sample_id}"] * len(rs.full_batch), dtype=object) - # 第二步:合并batch - # 将 original_batch 的 non_tensor_batch 和 meta_info 合并到 final_batch + # Step 2: Merge batches + # Merge the non_tensor_batch and meta_info of original_batch into final_batch for key, value in rs.full_batch.non_tensor_batch.items(): gen_batch_output.non_tensor_batch[key] = value gen_batch_output.meta_info.update(rs.full_batch.meta_info) - # 
第三步,设置 full_batch + # Step 3, set full_batch rs.full_batch = gen_batch_output rs.processing_times = [] for agent_loop in rs.agent_loop_output_list: rs.processing_times.append(agent_loop.metrics.generate_sequences) rs.param_version_start = [agent_loop.param_version_start for agent_loop in rs.agent_loop_output_list] rs.param_version_end = [agent_loop.param_version_end for agent_loop in rs.agent_loop_output_list] - # 第四步,清空 agent_loop_output_list + # Step 4, clear agent_loop_output_list rs.agent_loop_output_list = [] return rs @@ -164,7 +158,7 @@ def assemble_batch_from_rollout_samples( ) -> DataProto: """ Assemble gen_batch_output from RolloutSample objects - 从 RolloutSample 对象中组装批次,类似 ray_trainer 的 _post_generate_batch 逻辑 + Assembles batches from RolloutSample objects, similar to the _post_generate_batch logic in ray_trainer. Args: rollout_samples: List of RolloutSample objects @@ -188,7 +182,7 @@ def assemble_batch_from_rollout_samples( rollout_samples_batch = [] processing_times = [] rollout_status = rollout_samples[0].rollout_status - # 为 rollout_status 的所有 key 添加前缀 + # Add a prefix to all rollout_status keys rollout_status = {f"fully_async/{key}": value for key, value in rollout_status.items()} for rs in rollout_samples: @@ -196,18 +190,18 @@ def assemble_batch_from_rollout_samples( processing_times.extend(rs.processing_times) final_batch = DataProto.concat(rollout_samples_batch) - # 计算 response_mask(如果不存在) + # Calculate response_mask (if not present) if "response_mask" not in final_batch.batch.keys(): final_batch.batch["response_mask"] = compute_response_mask(final_batch) if balance_batch: balance_batch(final_batch, metrics={}) - # 计算全局有效 token 数 + # Calculate the global valid token number if "attention_mask" in final_batch.batch: final_batch.meta_info["global_token_num"] = torch.sum(final_batch.batch["attention_mask"], dim=-1).tolist() - # 收集统计信息和元数据(直接从 RolloutSample 中获取) + # Collect statistics param_versions = [rs.param_version for rs in rollout_samples] 
trajectorys_param_versions = [version for rs in rollout_samples for version in rs.param_version_end] @@ -215,9 +209,9 @@ def assemble_batch_from_rollout_samples( "processing_time/avg": np.mean(processing_times), "processing_time/max": np.max(processing_times), "processing_time/min": np.min(processing_times), - "processing_time/tp50": np.percentile(processing_times, 50), # 中位数 - "processing_time/tp99": np.percentile(processing_times, 99), # 99百分位 - "processing_time/tp95": np.percentile(processing_times, 95), # 95百分位也很有用 + "processing_time/tp50": np.percentile(processing_times, 50), + "processing_time/tp99": np.percentile(processing_times, 99), + "processing_time/tp95": np.percentile(processing_times, 95), } processing_time_stats = {f"fully_async/{key}": value for key, value in processing_time_stats.items()} @@ -228,7 +222,7 @@ def assemble_batch_from_rollout_samples( "fully_async/partial/partial_ratio": (len(param_version_diff) - num_diff0) / len(param_version_diff), "fully_async/partial/max_partial_span": max(param_version_diff), } - # 创建 meta_info + # add meta_info final_batch.meta_info.update( { "rollout_param_versions": param_versions, @@ -274,7 +268,7 @@ def _init_aggregation_rules(self) -> dict[str, dict[str, list[str]]]: "fully_async/count/stale_trajectory_processed", "fully_async/count/current_param_version", "fully_async/count/dropped_stale_samples", - "training/global_step", # TODO 改为total_step + "training/global_step", # TODO change name to: total_step ], } diff --git a/recipe/fully_async_policy/fsdp_workers.py b/recipe/fully_async_policy/fsdp_workers.py index 7a1b59aa64c..9e4f96c2e8b 100644 --- a/recipe/fully_async_policy/fsdp_workers.py +++ b/recipe/fully_async_policy/fsdp_workers.py @@ -55,7 +55,7 @@ def get_inference_model(rollout): Args: rollout: rollout object Returns: - model: 模型对象 + model: model object """ inference_engine = rollout.inference_engine if hasattr(inference_engine, "llm_engine"): diff --git 
a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 1027e228c18..81ebf9780f8 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -115,7 +115,6 @@ def __init__( self.staleness_threshold: float = config.async_training.get("staleness_threshold", 1) self.required_samples = None self.max_required_samples = None - # 单次最多扔一次更新需要的样本 self.max_concurrent_samples = None # queue size self.max_queue_size = None @@ -125,22 +124,22 @@ def __init__( self.total_generated_samples = 0 self.staleness_samples = 0 self.dropped_stale_samples = 0 - self.processed_sample_count = 0 # 已处理的样本计数 + self.processed_sample_count = 0 self.global_steps = 0 self.idle_start_time = None self.version_start_time = None # Concurrency control + # Modified by self.pause() or self._should_pause_generation() self.paused = False self.running = True - # 通过 pause 和 resume 控制 monitor_loop 中,是否进行 尝试恢复 操作 self.monitor_loop_trigger = True # Initialize async locks directly self.lock = asyncio.Lock() self.condition = asyncio.Condition(self.lock) - # 初始化异步队列 + # Initialize async queues self.pending_queue = asyncio.Queue(maxsize=128) self.active_tasks = set() self.result_queue = asyncio.Queue() @@ -164,7 +163,7 @@ async def set_required_samples(self, required_samples: int): / (self.required_samples * self.config.async_training.trigger_parameter_sync_step) ) - # 单次最多扔一次更新需要的样本 + # max_concurrent_samples should be related to the resources self.max_concurrent_samples = self.async_rollout_manager.rollout_dp_size * 16 self.max_concurrent_samples = min(self.max_concurrent_samples, self.max_required_samples) self.max_queue_size = self.max_required_samples @@ -275,12 +274,12 @@ def _init_async_rollout_manager(self): worker_group=self.rollout_wg, ) - # 添加样本到待处理队列的协程 + # Add samples to the pending_queue async def _feed_samples(self): continuous_iterator = self._create_continuous_iterator() for epoch, 
batch_dict in continuous_iterator: - # 类似 _prepare_generate_batch 的逻辑:分离数据 + # Similar to _prepare_generate_batch: Separate data full_batch = prepare_single_generation_data( batch_dict, self.global_steps, self.config.actor_rollout_ref.rollout.n ) @@ -289,10 +288,10 @@ async def _feed_samples(self): rollout_sample = RolloutSample( full_batch=full_batch, - agent_loop_output_list=[None] * self.config.actor_rollout_ref.rollout.n, # 待处理后填充 + agent_loop_output_list=[None] * self.config.actor_rollout_ref.rollout.n, sample_id=sample_id, epoch=epoch, - param_version=0, # 待处理后填充 + param_version=0, param_version_start=[], param_version_end=[], processing_times=[], @@ -301,23 +300,25 @@ async def _feed_samples(self): await self.pending_queue.put(rollout_sample) - # 检查是否到达最后一步 + # Check if have reached the last step if self.global_steps >= self.total_rollout_steps: print( f"[FullyAsyncRollouter][Feed] " - f"达到最大步数,停止添加新样本 " + f"Maximum count has been reached, stop adding new samples" f"{self.global_steps} >= {self.total_rollout_steps}" ) break self.global_steps += 1 - # 发送结束信号 + # End signal await self.pending_queue.put("DONE") - print(f"[FullyAsyncRollouter][Feed] 样本添加完成,总共添加了 {self.global_steps} 个步骤的样本") + print(f"[FullyAsyncRollouter][Feed] Sample addition is complete, {self.global_steps} samples have been added") async def _processor_worker(self): - """流式处理工作协程 - 逐个样本立即提交处理,不等待批次""" + """ + Streaming worker coroutines, a sample is submitted for processing without waiting for batches + """ while True: simple_from_cancel_queue = False @@ -328,15 +329,15 @@ async def _processor_worker(self): rollout_sample = await self.pending_queue.get() self.staleness_samples += 1 - # 判断是否需要暂停 - # self.paused 由 pause() 和 self._should_pause_generation() 负责修改 if self.paused or await self._should_pause_generation(): - print("[FullyAsyncRollouter][Processor] 收到暂停信号,等待剩余任务完成...") + print( + "[FullyAsyncRollouter][Processor] Received pause signal, waiting for remaining tasks to return..." 
+ ) async with self.lock: self.paused = True while self.active_tasks: async with self.lock: - # 获取锁后,active_tasks 数量会发生变化,需要再次校验 + # After acquiring the lock, the number of active_tasks may change, need to be verified again if self.active_tasks: done_tasks, self.active_tasks = await asyncio.wait( self.active_tasks, return_when=asyncio.FIRST_COMPLETED @@ -349,9 +350,10 @@ async def _processor_worker(self): self.idle_start_time = time.time() await self.condition.wait() - # 获取待处理的部分 RolloutSample if rollout_sample == "DONE": - print("[FullyAsyncRollouter][Processor] 收到结束信号,等待剩余任务完成...") + print( + "[FullyAsyncRollouter][Processor] Received end signal, waiting for the remaining tasks to complete..." + ) while self.active_tasks: async with self.lock: if self.active_tasks: @@ -362,7 +364,7 @@ async def _processor_worker(self): await task break - # 检查并发数是否超限 + # Check whether the number of concurrent tasks exceeds the limit while len(self.active_tasks) >= self.max_concurrent_samples: async with self.lock: if self.active_tasks: @@ -372,9 +374,10 @@ async def _processor_worker(self): for task in done_tasks: await task - # 立即提交单个样本处理 + # Submit single sample processing async with self.lock: - # pause结束后,获取到锁,还需要判断是否是暂停阶段,否则继续等待 + # After the pause is over, the lock is acquired and it is necessary + # to determine whether it is the pause phase, otherwise continue to wait while self.paused: await self.condition.wait() task = asyncio.create_task( @@ -383,41 +386,29 @@ async def _processor_worker(self): ) self.active_tasks.add(task) - # 标记队列任务完成 if simple_from_cancel_queue: self.cancel_queue.task_done() else: self.pending_queue.task_done() async def _process_single_sample_streaming(self, rollout_sample: RolloutSample): - """流式处理单个样本""" - # 调用异步生成方法 + """Process a single sample streamingly""" + # Calling asynchronous generation methods agent_loop_output_list = await self.async_rollout_manager.generate_single_sample_async( rollout_sample.full_batch, self.current_param_version, 
rollout_sample.agent_loop_output_list ) - # 直接更新 RolloutSample 对象,填充剩余字段 rollout_sample.agent_loop_output_list = agent_loop_output_list is_cancel = False - # 收集所有信息 for agent_loop in agent_loop_output_list: if not is_cancel and agent_loop.is_cancel: is_cancel = True - # rollout_data = { - # "cost": [f"{agent_loop.metrics.generate_sequences:.2f}s" for agent_loop in agent_loop_output_list], - # "len": [len(agent_loop.response_ids) for agent_loop in agent_loop_output_list], - # } - # if is_cancel: - # rollout_data["cancel"] = [agent_loop.is_cancel for agent_loop in agent_loop_output_list] - # formatted_data = pformat(rollout_data, width=200, compact=True) - # print(f"[FullyAsyncRollouter] rollout {rollout_sample.sample_id} {formatted_data}") - if is_cancel: - # 放入 cancel 队列中,等待恢复生成 + # Put in the cancel queue and wait for the generation to resume await self.cancel_queue.put(rollout_sample) else: - # 否则放入结果队列 + # put into the result_queue rollout_sample.param_version = self.current_param_version rollout_sample.rollout_status = await self.get_statistics() await self.result_queue.put(rollout_sample) @@ -425,13 +416,15 @@ async def _process_single_sample_streaming(self, rollout_sample: RolloutSample): self.processed_sample_count += 1 async def _consumer_worker(self): - """消费者协程,负责从结果队列获取处理结果并放入消息队列""" + """ + The consumer coroutine is responsible for obtaining the processing results + from the result queue and putting them into the message queue + """ while True: - # 从结果队列获取 RolloutSample rollout_sample = await self.result_queue.get() rollout_sample = merge_rollout_sample(self.config, self.tokenizer, rollout_sample) - # 将 RolloutSample 放入消息队列 + # Put RolloutSample into the message queue success = await self.message_queue_client.put_sample( sample=ray.cloudpickle.dumps(rollout_sample), param_version=rollout_sample.param_version, @@ -441,55 +434,50 @@ async def _consumer_worker(self): else: self.dropped_stale_samples += 1 - # 标记结果队列任务完成 self.result_queue.task_done() async 
def _streaming_generation_main(self): - """流式处理的主入口方法,包含初始化和验证逻辑""" + """The main entry method for stream processing""" # we start from step 1 self.global_steps += 1 - # 确保async_rollout_manager已经初始化 if self.async_rollout_manager is None: self._init_async_rollout_manager() - # 启动流式处理循环 - print(f"[FullyAsyncRollouter] 启动流式处理模式,最大并发样本数: {self.max_concurrent_samples}") + # Start the streaming loop + print(f"[FullyAsyncRollouter] Start streaming mode, maximum concurrent samples: {self.max_concurrent_samples}") - # 启动流式处理协程和消费者协程 + # Start sample feed coroutine, streaming process coroutine and consumer coroutine self.feed_task = asyncio.create_task(self._feed_samples()) self.processor_task = asyncio.create_task(self._processor_worker()) self.consumer_task = asyncio.create_task(self._consumer_worker()) - # 启动样本添加协程 try: - # 等待样本添加完成 + # Wait for sample feed to complete await self.feed_task - print("[FullyAsyncRollouter] 样本添加完成") + print("[FullyAsyncRollouter] Sample feed completed") - # 等待流式处理完成 + # Wait for streaming to complete await self.processor_task - print("[FullyAsyncRollouter] 流式处理完成") + print("[FullyAsyncRollouter] Streaming process completed") - # 等待结果队列清空 + # Waiting for the result queue to clear await self.result_queue.join() - print("[FullyAsyncRollouter] 所有结果处理完成") + print("[FullyAsyncRollouter] Result queue cleared") except Exception as e: - print(f"[FullyAsyncRollouter] 流式处理异常: {e}") + print(f"[FullyAsyncRollouter] Streaming process exception:{e}") finally: - # 取消所有任务 if self.processor_task: self.processor_task.cancel() if self.consumer_task: self.consumer_task.cancel() - # 等待任务结束 await asyncio.gather(self.processor_task, self.consumer_task, return_exceptions=True) - # 发送终止信号 + # Send a finish signal await self.message_queue_client.put_sample( sample=None, param_version=self.current_param_version, @@ -501,35 +489,35 @@ async def _streaming_generation_main(self): async def fit(self): """ Start the async rollouter - entry point that sets up and runs async 
tasks - Main async fit method that coordinates all coroutines""" + Main async fit method that coordinates all coroutines + """ print("[FullyAsyncRollouter] Starting FullyAsyncRollouter...") if self.message_queue_client is None: raise ValueError("MessageQueue client not set. Call set_message_queue_client() first.") - # 设置运行状态 + # Set the running status flag async with self.lock: self.paused = False self.running = True - # 创建主要的异步任务 + # Create the main asynchronous task generation_task = asyncio.create_task(self._streaming_generation_main()) monitor_task = asyncio.create_task(self._async_monitor_loop()) try: - # 并发运行生成和监控任务 + # Run build and monitoring tasks concurrently await asyncio.gather(generation_task, monitor_task, return_exceptions=True) except Exception as e: - print(f"[FullyAsyncRollouter] 异步任务执行出错: {e}") + print(f"[FullyAsyncRollouter] Asynchronous task execution error: {e}") finally: - # 清理任务 if not generation_task.done(): generation_task.cancel() if not monitor_task.done(): monitor_task.cancel() - # 等待任务完成 + # Wait for the task to complete await asyncio.gather(generation_task, monitor_task, return_exceptions=True) print("[FullyAsyncRollouter] Rollouter fit completed") @@ -549,14 +537,14 @@ async def _async_monitor_loop(self): if not self.running: break await asyncio.sleep(check_interval) - # 定期打印统计信息 + # Print statistics periodically current_time = time.time() if current_time - last_stats_time >= stats_interval: stats = await self.get_statistics() print(f"[FullyAsyncRollouter][MonitorLoop][Statistics] {pformat(stats)}") last_stats_time = current_time - # pause 和 resume 之间,不进行恢复操作 + # Trigger rollout recovery if self.monitor_loop_trigger: if not await self._should_pause_generation(): async with self.lock: @@ -594,7 +582,7 @@ async def pause(self): print("[FullyAsyncRollouter][Public][Pause]") async with self.lock: self.paused = True - # 取消rollout所有任务 + # Cancel all rollout tasks if self.config.async_training.partial_rollout: await 
self.async_rollout_manager.cancel_async() if self.active_tasks: @@ -619,23 +607,23 @@ async def get_statistics(self) -> dict: queue_stats = self.message_queue_client.get_statistics_sync() stats = { - # static stats - "static/max_required_samples": self.max_required_samples, - "static/required_samples": self.required_samples, - "static/staleness_threshold": self.staleness_threshold, - "static/max_queue_size": self.max_queue_size, - "static/max_concurrent_samples": self.max_concurrent_samples, - # counting stats - "count/current_param_version": self.current_param_version, - "count/total_generated_samples": self.total_generated_samples, - "count/staleness_samples": self.staleness_samples, - "count/dropped_stale_samples": self.dropped_stale_samples, # monitor stats "monitor/active_tasks_size": len(self.active_tasks), "monitor/queue/pending_queue_size": self.pending_queue.qsize(), "monitor/queue/cancel_queue_size": self.cancel_queue.qsize(), "monitor/queue/result_queue_size": self.result_queue.qsize(), "monitor/queue/mq_queue_size": queue_stats["queue_size"], + # counting stats + "count/current_param_version": self.current_param_version, + "count/total_generated_samples": self.total_generated_samples, + "count/staleness_samples": self.staleness_samples, + "count/dropped_stale_samples": self.dropped_stale_samples, + # static stats + "static/max_required_samples": self.max_required_samples, + "static/required_samples": self.required_samples, + "static/staleness_threshold": self.staleness_threshold, + "static/max_queue_size": self.max_queue_size, + "static/max_concurrent_samples": self.max_concurrent_samples, } return stats diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index f014993f13e..c3d5773bde2 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -158,11 +158,10 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, 
Any]: queue_samples = [] while len(queue_samples) < self.required_samples: - # 获取单个样本,会一直等待直到有样本或收到None + # Get a single sample and wait until there is a sample or None is received sample, queue_len = self.message_queue_client.get_sample_sync() if sample is None: - # 检测到结束信号(None),立即退出 print( f"[FullyAsyncTrainer] Detected termination signal (None), stopping sample collection. " f"Collected {len(queue_samples)}/{self.required_samples} samples" @@ -322,7 +321,7 @@ def fit(self): pprint(f"[FullyAsyncTrainer] Final validation metrics: {val_data.metrics}") self.progress_bar.close() - self._check_save_checkpoint(True, timing_raw) # TODO: 检查checkpoint + self._check_save_checkpoint(True, timing_raw) # TODO: check checkpoint def load_checkpoint(self): return self._load_checkpoint() diff --git a/recipe/fully_async_policy/ray_trainer.py b/recipe/fully_async_policy/ray_trainer.py index dea3aa2c26e..c0543191e8e 100644 --- a/recipe/fully_async_policy/ray_trainer.py +++ b/recipe/fully_async_policy/ray_trainer.py @@ -80,11 +80,10 @@ class Role(Enum): ActorRolloutRef = 6 def __str__(self): - """返回与代码中一致的字符串表示""" return self._get_role_string() def _get_role_string(self): - """获取角色对应的字符串名称""" + """Get the string name corresponding to the role""" role_mapping = { Role.Actor: "actor", Role.Rollout: "rollout", @@ -98,7 +97,7 @@ def _get_role_string(self): @classmethod def from_string(cls, name: str): - """从字符串创建Role实例""" + """Create a Role instance from a string""" string_mapping = { "actor": cls.Actor, "rollout": cls.Rollout, diff --git a/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py b/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py index 4826ebaa1d0..2f2cef94a0e 100644 --- a/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py +++ b/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py @@ -349,10 +349,9 @@ async def _generate_step(self, prompt_ids: list[int], sampling_params: dict[str, async def generate_for_partial( self, prompt_ids: list[int], 
sampling_params: dict[str, Any], request_id: str ) -> tuple[list[Any], list[Any], bool] | tuple[Sequence[int], list[float], Any]: - # 设置中断标志 async with self.lock: if self.paused: - # cancel 后, 所有任务直接返回,等待下次提交 + # After cancel, all tasks will return directly and wait for the next submission return [], [], True self.cancel_event[request_id] = asyncio.Event() cancel_handle = asyncio.create_task(self.cancel_event[request_id].wait()) @@ -370,7 +369,8 @@ async def generate_for_partial( token_ids = self.req_output[request_id].outputs[0].token_ids log_probs: list[float] = [] for i, x in enumerate(self.req_output[request_id].outputs[0].logprobs): - # sampling_params 中 logprobs 设置为1,应该返回1个, 但是实测会有多个,取token_id所对应的log_prob + # In sampling_params, logprobs is set to 1, which should return 1, + # but in practice there are multiple. Take the log_prob corresponding to token_id token_id = self.req_output[request_id].outputs[0].token_ids[i] log_probs.append(x[token_id].logprob) is_cancel = generation_handle not in done From 106f5eb75adb1f0e3614d8a5d2d807cdd301effb Mon Sep 17 00:00:00 2001 From: wangshulin02 <953550366@qq.com> Date: Wed, 17 Sep 2025 16:11:46 +0800 Subject: [PATCH 149/182] fix typo --- recipe/fully_async_policy/fully_async_rollouter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 81ebf9780f8..dcc84f81993 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -352,7 +352,7 @@ async def _processor_worker(self): if rollout_sample == "DONE": print( - "[FullyAsyncRollouter][Processor] Received end signal, waiting for the remaining tasks to complete..." + "[FullyAsyncRollouter][Processor] Received end signal, waiting for remaining tasks to complete..." 
) while self.active_tasks: async with self.lock: From c2219e0938c5b333a0c568ab51ce9e694099355f Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Thu, 18 Sep 2025 11:29:31 +0800 Subject: [PATCH 150/182] qwen3-32B-sta0 --- .../exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh | 4 ++-- .../qwen2-32B_128/fsdp2_fully-async_64-64/runtime_env.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh index 48be3ab3c84..c79c960a701 100644 --- a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh +++ b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh @@ -65,9 +65,9 @@ n_resp_per_prompt=16 train_prompt_mini_bsz=16 total_rollout_steps=$(((512*200))) test_freq=20 -staleness_threshold=0.1 +staleness_threshold=0 trigger_parameter_sync_step=8 -partial_rollout=True +partial_rollout=False python -m recipe.fully_async_policy.fully_async_main \ data.train_files="${TRAIN_FILE}" \ diff --git a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/runtime_env.yaml index ea506be787e..61c0adbaca7 100644 --- a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/runtime_env.yaml +++ b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/runtime_env.yaml @@ -1,4 +1,4 @@ env_vars: VLLM_USE_V1: "1" - TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-32B-128/dapo_qwen2-32B_20k_fsdp2_fully-async_64-64-tps1" + TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-32B-128/dapo_qwen2-32B_20k_fsdp2_fully-async_64-64-sta0" HYDRA_FULL_ERROR: "1" \ No newline at end of file From 91d199c9dd941ff03ce9415956aff3d0172897b9 Mon Sep 17 00:00:00 2001 From: 
ArronHZG Date: Thu, 18 Sep 2025 16:07:59 +0800 Subject: [PATCH 151/182] refactor 5 --- .../grpo_trainer/run_qwen2-7b_seq_balance.sh | 6 +- .../agent_loop/agent_loop.py | 175 +++--------------- .../partial_single_turn_agent_loop.py | 20 +- recipe/fully_async_policy/detach_utils.py | 1 - .../fully_async_rollouter.py | 12 +- .../fully_async_policy/fully_async_trainer.py | 1 + recipe/fully_async_policy/ray_trainer.py | 5 - .../vllm_rollout/vllm_async_server.py | 2 +- tests/special_e2e/run_fully_async_policy.sh | 2 +- verl/experimental/agent_loop/agent_loop.py | 7 +- verl/workers/rollout/replica.py | 1 - 11 files changed, 54 insertions(+), 178 deletions(-) diff --git a/examples/grpo_trainer/run_qwen2-7b_seq_balance.sh b/examples/grpo_trainer/run_qwen2-7b_seq_balance.sh index fdc1ef606d7..f4ca9a41d7e 100644 --- a/examples/grpo_trainer/run_qwen2-7b_seq_balance.sh +++ b/examples/grpo_trainer/run_qwen2-7b_seq_balance.sh @@ -3,7 +3,7 @@ set -x # For async rollout mode, dataset should return raw chat. 
rollout_mode="async" -rollout_name="sglang" # sglang or vllm +rollout_name="vllm" # sglang or vllm if [ "$rollout_mode" = "async" ]; then export VLLM_USE_V1=1 return_raw_chat="True" @@ -19,7 +19,7 @@ python3 -m verl.trainer.main_ppo \ data.max_response_length=1024 \ data.filter_overlong_prompts=True \ data.truncation='error' \ - actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \ + actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B-Instruct \ actor_rollout_ref.actor.optim.lr=1e-6 \ actor_rollout_ref.model.use_remove_padding=True \ actor_rollout_ref.actor.ppo_mini_batch_size=256 \ @@ -41,7 +41,7 @@ python3 -m verl.trainer.main_ppo \ actor_rollout_ref.ref.fsdp_config.param_offload=True \ algorithm.use_kl_in_reward=False \ trainer.critic_warmup=0 \ - trainer.logger='["console","wandb"]' \ + trainer.logger='["console","tensorboard"]' \ trainer.project_name='verl_grpo_example_gsm8k' \ trainer.experiment_name='qwen2_7b_function_rm_kl1e-3' \ trainer.val_before_train=False \ diff --git a/recipe/fully_async_policy/agent_loop/agent_loop.py b/recipe/fully_async_policy/agent_loop/agent_loop.py index a13c4744e08..1b0b9218087 100644 --- a/recipe/fully_async_policy/agent_loop/agent_loop.py +++ b/recipe/fully_async_policy/agent_loop/agent_loop.py @@ -14,16 +14,26 @@ import asyncio import logging import os - -from recipe.fully_async_policy.vllm_rollout.vllm_async_server import vLLMReplicaForPartial -from verl.experimental.agent_loop.agent_loop import AgentLoopOutput, _agent_loop_registry, _DummyConfig +from typing import Optional, Any + +import hydra +import numpy as np +import ray +import torch +from omegaconf import DictConfig + +from recipe.fully_async_policy.vllm_rollout.vllm_async_server import FullyAsyncvLLMReplica +from verl.experimental.agent_loop.agent_loop import (AgentLoopOutput, _agent_loop_registry, _DummyConfig, + AsyncLLMServerManager, AgentLoopWorkerBase, BatchExecutor, + get_trajectory_info, AgentLoopManager) from verl.protocol import DataProto +from 
verl.single_controller.ray import RayWorkerGroup +from verl.utils.rollout_trace import rollout_trace_attr +from verl.workers.rollout.replica import TokenOutput logger = logging.getLogger(__file__) logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) -from verl.experimental.agent_loop.agent_loop import * - class FullyAsyncLLMServerManager(AsyncLLMServerManager): async def generate_for_partial(self, request_id, prompt_ids, sampling_params) -> TokenOutput: @@ -37,61 +47,24 @@ async def generate_for_partial(self, request_id, prompt_ids, sampling_params) -> return output -class PartialAgentLoopOutput(AgentLoopOutput): +class FullyAsyncAgentLoopOutput(AgentLoopOutput): """Agent loop output.""" - is_cancel: bool = False """Indicates whether the request was interrupted""" log_probs: list[float] = None """Response token log probs including LLM generated token, tool response token.""" - + param_version_start: int = 0 + """Indicate start parameter version when this response is generated""" + param_version_end: int = 0 + """Indicate end parameter version when this response is generated, used for partial rollout""" @ray.remote class FullyAsyncAgentLoopWorker(AgentLoopWorkerBase): def __init__( self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], rm_executor: BatchExecutor = None ): - """Initialize agent loop manager. - - Args: - config (DictConfig): YAML config. - server_handles (List[ray.actor.ActorHandle]): OpenAI compatible LLM server actor handles. 
- """ - self.config = config - - self.server_manager = FullyAsyncLLMServerManager(config, server_handles) - self.rm_executor = rm_executor - - model_path = config.actor_rollout_ref.model.path - self.model_name = "/".join(model_path.split("/")[-2:]) - local_path = copy_to_local(config.actor_rollout_ref.model.path) - self.tokenizer = hf_tokenizer(local_path, trust_remote_code=True) - self.processor = hf_processor(local_path, trust_remote_code=True) - - agent_loop_config_path = config.actor_rollout_ref.rollout.agent.agent_loop_config_path - if agent_loop_config_path: - agent_loop_configs = OmegaConf.load(agent_loop_config_path) - for agent_loop_config in agent_loop_configs: - _agent_loop_registry[agent_loop_config.name] = agent_loop_config - if self.config.actor_rollout_ref.model.get("custom_chat_template", None) is not None: - if self.processor is not None: - self.processor.chat_template = self.config.actor_rollout_ref.model.custom_chat_template - self.tokenizer.chat_template = self.config.actor_rollout_ref.model.custom_chat_template - - self.reward_manager_worker = RewardManagerWorker.options( - scheduling_strategy=ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy( - node_id=ray.get_runtime_context().get_node_id(), - soft=False, - ), - ).remote(self.config, local_path, self.rm_executor) - - trace_config = self.config.actor_rollout_ref.rollout.get("trace", {}) - RolloutTraceConfig.init( - self.config.trainer.project_name, - self.config.trainer.experiment_name, - trace_config.get("backend"), - trace_config.get("token2text", False), - ) + self.server_manager_class = FullyAsyncLLMServerManager(config, server_handles) + super().__init__(config, server_handles, rm_executor) async def generate_sequences_no_post( self, batch: DataProto, partial_output_list: Optional[list[AgentLoopOutput]] @@ -103,13 +76,7 @@ async def generate_sequences_no_post( partial_output_list: Optional[List[AgentLoopOutput]]: already rollout result. 
Returns: - list[AgentLoopOutput]: List of agent loop outputs, one per sample in the batch. - Each AgentLoopOutput contains: - - prompt_ids: prompt token ids - - response_ids: response token ids including LLM generated and tool response tokens - - response_mask: 1 for LLM generated tokens, 0 for tool response tokens - - num_turns: number of chat turns - - metrics: performance metrics + list[FullyAsyncAgentLoopOutput]: List of agent loop outputs, one per sample in the batch. """ config = self.config.actor_rollout_ref.rollout sampling_params = dict( @@ -172,7 +139,7 @@ async def _partial_run_agent_loop( agent_loop = hydra.utils.instantiate( config=agent_loop_config, trainer_config=_DummyConfig(config=self.config), - server_manager=self.server_manager, + server_manager=self.server_manager_class, tokenizer=self.tokenizer, processor=self.processor, ) @@ -187,7 +154,7 @@ def __init__(self, config: DictConfig, worker_group: RayWorkerGroup = None, rm_w self.rm_executor = None self.rm_micro_batch_size = None self.agent_loop_workers_class = FullyAsyncAgentLoopWorker - self.rollout_replica_class = vLLMReplicaForPartial + self.rollout_replica_class = FullyAsyncvLLMReplica # 初始化其他必要属性为None,稍后在异步初始化中设置 self.rm_wg = rm_wg @@ -199,7 +166,6 @@ def __init__(self, config: DictConfig, worker_group: RayWorkerGroup = None, rm_w @classmethod async def create(cls, config: DictConfig, worker_group: RayWorkerGroup = None, rm_wg: RayWorkerGroup = None): """异步工厂方法来创建和初始化 PartialAgentLoopManager 实例""" - print("异步工厂方法来创建和初始化 PartialAgentLoopManager 实例") instance = cls(config, worker_group, rm_wg) await instance._async_init() return instance @@ -207,7 +173,6 @@ async def create(cls, config: DictConfig, worker_group: RayWorkerGroup = None, r async def _async_init(self): """异步初始化方法""" # 处理 rm_wg 相关初始化 - print("处理 rm_wg 相关初始化") if self.rm_wg: def batch_fn(data_list: list[DataProto]) -> list[torch.Tensor]: new_data_list = [] @@ -229,14 +194,8 @@ def batch_fn(data_list: list[DataProto]) -> 
list[torch.Tensor]: self.rm_micro_batch_size = self.rm_wg.world_size - # 初始化 LLM 服务器 - print("初始化 LLM 服务器") await self._initialize_llm_servers_async() - await self._init_agent_loop_workers_async() - - # 最初处于睡眠模式 - if self.config.actor_rollout_ref.rollout.free_cache_engine: - await self.sleep() + self._init_agent_loop_workers() async def _initialize_llm_servers_async(self): """异步初始化 LLM 服务器""" @@ -256,37 +215,16 @@ async def _initialize_llm_servers_async(self): ] if self.worker_group: - print("await asyncio.gather(*[server.init_hybrid(self.worker_group) for server in self.rollout_replicas])") await asyncio.gather(*[server.init_hybrid(self.worker_group) for server in self.rollout_replicas]) else: - print("asyncio.gather(*[server.init_standalone() for server in self.rollout_replicas])") await asyncio.gather(*[server.init_standalone() for server in self.rollout_replicas]) self.server_handles = [server._server_handle for server in self.rollout_replicas] self.server_addresses = [server._server_address for server in self.rollout_replicas] - async def _init_agent_loop_workers_async(self): - """异步初始化 agent loop workers""" - self.agent_loop_workers = [] - num_workers = self.config.actor_rollout_ref.rollout.agent.num_workers - - node_ids = [node["NodeID"] for node in ray.nodes() if node["Alive"] and node["Resources"].get("CPU", 0) > 0] - tasks = [] - for i in range(num_workers): - # Round-robin scheduling over the all nodes - node_id = node_ids[i % len(node_ids)] - worker = self.agent_loop_workers_class.options( - name=f"agent_loop_worker_{i}", - scheduling_strategy=ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy( - node_id=node_id, soft=True - ), - ).remote(self.config, self.server_handles, self.rm_executor) - self.agent_loop_workers.append(worker) - async def generate_single_sample_async( self, sample: DataProto, - param_version: int, partial_output_list: Optional[list[AgentLoopOutput]], ) -> list[AgentLoopOutput]: """ @@ -294,16 +232,13 @@ async def 
generate_single_sample_async( Args: sample: 单个样本数据 - param_version: 参数版本 partial_output_list: Optional[List[AgentLoopOutput]]: 已经 rollout 的结果 Returns: list[AgentLoopOutput]: 处理结果列表 """ - # 使用负载均衡选择 worker worker = self._select_best_worker() - # 异步处理单个样本 - 使用无后处理版本获取原始AgentLoopOutput - output_future = worker.generate_sequences_no_post.remote(sample, param_version, partial_output_list) + output_future = worker.generate_sequences_no_post.remote(sample, partial_output_list) return await asyncio.wrap_future(output_future.future()) def _select_best_worker(self): @@ -326,59 +261,3 @@ async def wake_up(self): async def sleep(self): await asyncio.gather(*[replica.sleep() for replica in self.rollout_replicas]) - -# class PartialAgentLoopManager(AgentLoopManager): -# def __init__(self, config: DictConfig, worker_group: RayWorkerGroup = None, rm_wg: RayWorkerGroup = None): -# self.agent_loop_workers_class = FullyAsyncAgentLoopWorker -# self.rollout_replica_class = vLLMReplicaForPartial -# super().__init__(config, worker_group, rm_wg) -# -# async def generate_single_sample_async( -# self, -# sample: DataProto, -# param_version: int, -# partial_output_list: Optional[list[AgentLoopOutput]], -# ) -> list[AgentLoopOutput]: -# """ -# 异步处理单个样本, 需要复制n次 -# -# Args: -# sample: 单个样本数据 -# partial_output_list: Optional[List[AgentLoopOutput]]: already rollout result. 
-# -# Returns: -# tuple[AgentLoopOutput, float]: 处理结果和处理时间 -# """ -# # 使用负载均衡选择 worker -# worker = self._select_best_worker() -# # 异步处理单个样本 - 使用无后处理版本获取原始AgentLoopOutput -# output_future = worker.generate_sequences_no_post.remote(sample, param_version, partial_output_list) -# return await asyncio.wrap_future(output_future.future()) -# -# def _select_best_worker(self): -# """选择最佳的 worker(简单的轮询负载均衡)""" -# if not hasattr(self, "_worker_index"): -# self._worker_index = 0 -# -# worker = self.agent_loop_workers[self._worker_index] -# self._worker_index = (self._worker_index + 1) % len(self.agent_loop_workers) -# return worker -# -# def cancel(self): -# """Cancel all rollout tasks asynchronously.""" -# self._run_all([replica.cancel() for replica in self.rollout_replicas]) -# -# def resume(self): -# """Resume all rollout tasks asynchronously.""" -# self._run_all([replica.resume() for replica in self.rollout_replicas]) -# -# def _run_all(self, tasks: list[asyncio.Task]): -# async def run_all(): -# await asyncio.gather(*tasks) -# -# try: -# loop = asyncio.get_running_loop() -# future = asyncio.run_coroutine_threadsafe(run_all(), loop) -# future.result() -# except RuntimeError: -# asyncio.run(run_all()) diff --git a/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py b/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py index e4b2a7115bb..85b2cd24d96 100644 --- a/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py +++ b/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py @@ -17,11 +17,11 @@ from uuid import uuid4 from recipe.fully_async_policy.agent_loop.agent_loop import ( - AgentLoopBase, AgentLoopOutput, - PartialAgentLoopOutput, - register, + FullyAsyncAgentLoopOutput ) +from verl.experimental.agent_loop.agent_loop import register +from verl.experimental.agent_loop import AgentLoopBase from verl.utils.profiler import simple_timer logger = logging.getLogger(__file__) @@ -39,11 +39,16 @@ def 
__init__(self, *args, **kwargs): self.apply_chat_template_kwargs = self.config.data.get("apply_chat_template_kwargs", {}) async def run(self, sampling_params: dict[str, Any], **kwargs) -> AgentLoopOutput: - output: Optional[PartialAgentLoopOutput] = kwargs.get("output", None) + output: Optional[FullyAsyncAgentLoopOutput] = kwargs.get("output", None) messages = list(kwargs["raw_prompt"]) + param_version = kwargs.get("param_version", 0) metrics = {} request_id = uuid4().hex + + param_version_start = param_version + param_version_end = param_version + if not output: prompt_ids = await self.loop.run_in_executor( None, @@ -56,15 +61,14 @@ async def run(self, sampling_params: dict[str, Any], **kwargs) -> AgentLoopOutpu # 恢复暂停的样本,结果直接添加到 prompt_ids 后面 prompt_ids = output.prompt_ids + output.response_ids metrics["generate_sequences"] = output.metrics.generate_sequences + param_version_start = output.param_version_start else: # 同一批样本,部分cancel,部分没有cancel, 没有cancel的样本直接返回 return output - request_id = uuid4().hex with simple_timer("generate_sequences", metrics): response_ids, log_probs, is_cancel = await self.server_manager.generate_for_partial( request_id=request_id, prompt_ids=prompt_ids, sampling_params=sampling_params ) - if not output: response_mask = [1] * len(response_ids) # 暂停待恢复样本, 把输出结果加到 response_ids 后,并重置 response_mask @@ -74,7 +78,7 @@ async def run(self, sampling_params: dict[str, Any], **kwargs) -> AgentLoopOutpu response_ids = output.response_ids + response_ids response_mask = [1] * len(response_ids) - return PartialAgentLoopOutput( + return FullyAsyncAgentLoopOutput( prompt_ids=prompt_ids, response_ids=response_ids[: self.response_length], response_mask=response_mask[: self.response_length], @@ -82,4 +86,6 @@ async def run(self, sampling_params: dict[str, Any], **kwargs) -> AgentLoopOutpu metrics=metrics, is_cancel=is_cancel, log_probs=log_probs, + param_version_start=param_version_start, + param_version_end=param_version_end, ) diff --git 
a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py index dcb22972e27..0f6a26ca13f 100644 --- a/recipe/fully_async_policy/detach_utils.py +++ b/recipe/fully_async_policy/detach_utils.py @@ -226,7 +226,6 @@ def merge_rollout_sample(config, tokenizer, rs: RolloutSample): rs.param_version_end = [agent_loop.param_version_end for agent_loop in rs.agent_loop_output_list] # 第四步,清空 agent_loop_output_list rs.agent_loop_output_list = [] - return rs diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 82dc0bac008..40bf8eefa64 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -97,14 +97,14 @@ def __init__( self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler) + # ==================== fully async config ==================== + self.total_rollout_steps = len(self.train_dataloader) * self.config.trainer.total_epochs if self.config.rollout.total_rollout_steps is not None: self.total_rollout_steps = min(self.config.rollout.total_rollout_steps, self.total_rollout_steps) print(f"[FullyAsyncRollouter] Total rollout steps: {self.total_rollout_steps}") self.total_train_steps = None - # ==================== fully async config ==================== - # Rollouter parameter configuration self.message_queue_client = None @@ -247,15 +247,10 @@ async def init_workers(self): 1. Ray resource pools from configuration 2. Worker groups for each role (actor, critic, etc.) 
""" - print("_init_resource_pools") self._init_resource_pools() - print("_create_worker_classes") self._create_worker_classes() - print("_init_worker_groups") self._init_worker_groups() - print("_init_models") self._init_models() - print("_init_async_rollout_manager") await self._init_async_rollout_manager() def _create_actor_rollout_classes(self): @@ -285,7 +280,6 @@ def _create_continuous_iterator(self): async def _init_async_rollout_manager(self): # create async rollout manager and request scheduler - print(f"_init_async_rollout_manager !!!!!!!!!!!!! {self.config.actor_rollout_ref.rollout.mode}") assert self.config.actor_rollout_ref.rollout.mode == "async" from recipe.fully_async_policy.agent_loop import PartialAgentLoopManager self.async_rollout_mode = True @@ -415,7 +409,7 @@ async def _process_single_sample_streaming(self, rollout_sample: RolloutSample): """流式处理单个样本""" # 调用异步生成方法 agent_loop_output_list = await self.async_rollout_manager.generate_single_sample_async( - rollout_sample.full_batch, self.current_param_version, rollout_sample.agent_loop_output_list + rollout_sample.full_batch, rollout_sample.agent_loop_output_list ) # 直接更新 RolloutSample 对象,填充剩余字段 rollout_sample.agent_loop_output_list = agent_loop_output_list diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 4ff64d8f787..f13781c850e 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -83,6 +83,7 @@ def __init__( if self.config.algorithm.use_kl_in_reward: self.kl_ctrl_in_reward = core_algos.get_kl_controller(self.config.algorithm.kl_ctrl) + # ==================== fully async config ==================== self.message_queue_client = None self.param_synchronizer = None diff --git a/recipe/fully_async_policy/ray_trainer.py b/recipe/fully_async_policy/ray_trainer.py index d4e09e794d2..86992de4c13 100644 --- a/recipe/fully_async_policy/ray_trainer.py +++ 
b/recipe/fully_async_policy/ray_trainer.py @@ -61,15 +61,10 @@ def init_workers(self): 1. Ray resource pools from configuration 2. Worker groups for each role (actor, critic, etc.) """ - print("_init_resource_pools") self._init_resource_pools() - print("_create_worker_classes") self._create_worker_classes() - print("_init_worker_groups") self._init_worker_groups() - print("_init_models") self._init_models() - print("_init_async_rollout_manager") self._init_async_rollout_manager() def _init_resource_pools(self): diff --git a/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py b/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py index cbbea43e2ad..2faf6b89c0e 100644 --- a/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py +++ b/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py @@ -125,7 +125,7 @@ async def resume(self): self.paused = False -class vLLMReplicaForPartial(vLLMReplica): +class FullyAsyncvLLMReplica(vLLMReplica): def __init__(self, replica_rank: int, config: DictConfig, gpus_per_node: int = 8): super().__init__(replica_rank, config, gpus_per_node) self.server_class = vLLMHttpServerForPartial diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh index 2ddc61910ba..fa7587204f1 100644 --- a/tests/special_e2e/run_fully_async_policy.sh +++ b/tests/special_e2e/run_fully_async_policy.sh @@ -118,7 +118,7 @@ common_params=( trainer.logger=['console'] trainer.project_name='verl-test-fully-async' trainer.experiment_name="${exp_name}" - trainer.val_before_train=False + trainer.val_before_train=True trainer.save_freq=-1 trainer.resume_mode=disable trainer.nnodes=1 diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py index 11d541df17b..a6fdda210a0 100644 --- a/verl/experimental/agent_loop/agent_loop.py +++ b/verl/experimental/agent_loop/agent_loop.py @@ -364,7 +364,10 @@ def __init__( """ self.config = config - self.server_manager = 
AsyncLLMServerManager(config, server_handles) + # for recipe to change + if not hasattr(self, 'server_manager_class'): + self.server_manager_class = AsyncLLMServerManager(config, server_handles) + self.rm_executor = rm_executor model_path = config.actor_rollout_ref.model.path @@ -477,7 +480,7 @@ async def _run_agent_loop( agent_loop = hydra.utils.instantiate( config=agent_loop_config, trainer_config=_DummyConfig(config=self.config), - server_manager=self.server_manager, + server_manager=self.server_manager_class, tokenizer=self.tokenizer, processor=self.processor, ) diff --git a/verl/workers/rollout/replica.py b/verl/workers/rollout/replica.py index d673bb51cd7..5b289af4b7e 100644 --- a/verl/workers/rollout/replica.py +++ b/verl/workers/rollout/replica.py @@ -110,7 +110,6 @@ async def init_hybrid(self, worker_group: RayWorkerGroup): Args: worker_group: RayWorkerGroup, fused workers where training engine(fsdp/megatron) have been initialized. """ - print("=========== init_hybrid ============") self.rollout_mode = RolloutMode.HYBRID self.workers = worker_group.workers[ self.world_size * self.replica_rank : self.world_size * (self.replica_rank + 1) From 0e88084eb2dfb3e36e9fab53e0900053503a0576 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Thu, 18 Sep 2025 20:18:17 +0800 Subject: [PATCH 152/182] refactor 6 --- .../agent_loop/agent_loop.py | 50 ++++++++++------- .../partial_single_turn_agent_loop.py | 7 +-- recipe/fully_async_policy/detach_utils.py | 2 +- recipe/fully_async_policy/fsdp_workers.py | 13 +---- .../fully_async_rollouter.py | 5 +- .../fully_async_policy/fully_async_trainer.py | 5 +- .../vllm_rollout/vllm_async_server.py | 4 +- tests/special_e2e/run_fully_async_policy.sh | 4 +- verl/experimental/agent_loop/__init__.py | 2 +- verl/experimental/agent_loop/agent_loop.py | 7 +-- .../rollout/vllm_rollout/vllm_async_server.py | 56 +++++++++---------- .../rollout/vllm_rollout/vllm_rollout_spmd.py | 48 ++++++++-------- 12 files changed, 95 insertions(+), 108 
deletions(-) diff --git a/recipe/fully_async_policy/agent_loop/agent_loop.py b/recipe/fully_async_policy/agent_loop/agent_loop.py index 1b0b9218087..d9cb2c9187e 100644 --- a/recipe/fully_async_policy/agent_loop/agent_loop.py +++ b/recipe/fully_async_policy/agent_loop/agent_loop.py @@ -14,7 +14,7 @@ import asyncio import logging import os -from typing import Optional, Any +from typing import Any, Optional import hydra import numpy as np @@ -23,9 +23,16 @@ from omegaconf import DictConfig from recipe.fully_async_policy.vllm_rollout.vllm_async_server import FullyAsyncvLLMReplica -from verl.experimental.agent_loop.agent_loop import (AgentLoopOutput, _agent_loop_registry, _DummyConfig, - AsyncLLMServerManager, AgentLoopWorkerBase, BatchExecutor, - get_trajectory_info, AgentLoopManager) +from verl.experimental.agent_loop.agent_loop import ( + AgentLoopManager, + AgentLoopOutput, + AgentLoopWorkerBase, + AsyncLLMServerManager, + BatchExecutor, + _agent_loop_registry, + _DummyConfig, + get_trajectory_info, +) from verl.protocol import DataProto from verl.single_controller.ray import RayWorkerGroup from verl.utils.rollout_trace import rollout_trace_attr @@ -49,6 +56,7 @@ async def generate_for_partial(self, request_id, prompt_ids, sampling_params) -> class FullyAsyncAgentLoopOutput(AgentLoopOutput): """Agent loop output.""" + is_cancel: bool = False """Indicates whether the request was interrupted""" log_probs: list[float] = None @@ -58,16 +66,17 @@ class FullyAsyncAgentLoopOutput(AgentLoopOutput): param_version_end: int = 0 """Indicate end parameter version when this response is generated, used for partial rollout""" + @ray.remote class FullyAsyncAgentLoopWorker(AgentLoopWorkerBase): def __init__( - self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], rm_executor: BatchExecutor = None + self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], rm_executor: BatchExecutor = None ): self.server_manager_class = FullyAsyncLLMServerManager(config, 
server_handles) super().__init__(config, server_handles, rm_executor) async def generate_sequences_no_post( - self, batch: DataProto, partial_output_list: Optional[list[AgentLoopOutput]] + self, batch: DataProto, partial_output_list: Optional[list[AgentLoopOutput]] ) -> list[AgentLoopOutput]: """Generate sequences from agent loop. @@ -117,19 +126,19 @@ async def generate_sequences_no_post( return await asyncio.gather(*tasks) async def _partial_run_agent_loop( - self, - sampling_params: dict[str, Any], - trajectory: dict[str, Any], - *, - agent_name: str, - **kwargs, + self, + sampling_params: dict[str, Any], + trajectory: dict[str, Any], + *, + agent_name: str, + **kwargs, ) -> AgentLoopOutput: with rollout_trace_attr( - step=trajectory["step"], - sample_index=trajectory["sample_index"], - rollout_n=trajectory["rollout_n"], - validate=trajectory["validate"], - name="agent_loop", + step=trajectory["step"], + sample_index=trajectory["sample_index"], + rollout_n=trajectory["rollout_n"], + validate=trajectory["validate"], + name="agent_loop", ): assert agent_name in _agent_loop_registry, ( f"Agent loop {agent_name} not registered, registered agent loops: {_agent_loop_registry.keys()}" @@ -174,6 +183,7 @@ async def _async_init(self): """异步初始化方法""" # 处理 rm_wg 相关初始化 if self.rm_wg: + def batch_fn(data_list: list[DataProto]) -> list[torch.Tensor]: new_data_list = [] for data in data_list: @@ -223,9 +233,9 @@ async def _initialize_llm_servers_async(self): self.server_addresses = [server._server_address for server in self.rollout_replicas] async def generate_single_sample_async( - self, - sample: DataProto, - partial_output_list: Optional[list[AgentLoopOutput]], + self, + sample: DataProto, + partial_output_list: Optional[list[AgentLoopOutput]], ) -> list[AgentLoopOutput]: """ 异步处理单个样本 diff --git a/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py b/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py index 85b2cd24d96..e7223eea894 
100644 --- a/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py +++ b/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py @@ -16,12 +16,9 @@ from typing import Any, Optional from uuid import uuid4 -from recipe.fully_async_policy.agent_loop.agent_loop import ( - AgentLoopOutput, - FullyAsyncAgentLoopOutput -) -from verl.experimental.agent_loop.agent_loop import register +from recipe.fully_async_policy.agent_loop.agent_loop import AgentLoopOutput, FullyAsyncAgentLoopOutput from verl.experimental.agent_loop import AgentLoopBase +from verl.experimental.agent_loop.agent_loop import register from verl.utils.profiler import simple_timer logger = logging.getLogger(__file__) diff --git a/recipe/fully_async_policy/detach_utils.py b/recipe/fully_async_policy/detach_utils.py index 0f6a26ca13f..4225340e539 100644 --- a/recipe/fully_async_policy/detach_utils.py +++ b/recipe/fully_async_policy/detach_utils.py @@ -291,7 +291,7 @@ def assemble_batch_from_rollout_samples( } processing_time_stats = {f"fully_async/{key}": value for key, value in processing_time_stats.items()} - param_version_diff = [abs(a - b) for a, b in zip(rs.param_version_end, rs.param_version_start)] + param_version_diff = [abs(a - b) for a, b in zip(rs.param_version_end, rs.param_version_start, strict=False)] num_diff0 = param_version_diff.count(0) partial_stats = { "fully_async/partial/total_partial_num": len(param_version_diff) - num_diff0, diff --git a/recipe/fully_async_policy/fsdp_workers.py b/recipe/fully_async_policy/fsdp_workers.py index 241ce46272a..8471897fc83 100644 --- a/recipe/fully_async_policy/fsdp_workers.py +++ b/recipe/fully_async_policy/fsdp_workers.py @@ -18,29 +18,18 @@ import torch import torch.distributed -from omegaconf import DictConfig, OmegaConf -from torch.distributed.device_mesh import init_device_mesh +from omegaconf import DictConfig from torch.distributed.fsdp import FullyShardedDataParallel as FSDP -from transformers import AutoConfig -from 
verl.single_controller.base import Worker from verl.single_controller.base.decorator import Dispatch, register -from verl.utils import hf_processor, hf_tokenizer, omega_conf_to_dataclass -from verl.utils.debug import DistProfiler, DistProfilerExtension, log_gpu_memory_usage from verl.utils.device import ( get_device_name, - get_nccl_backend, get_torch_device, ) -from verl.utils.fs import copy_to_local from verl.utils.fsdp_utils import ( fsdp_version, ) -from verl.utils.import_utils import import_external_libs -from verl.utils.model import get_generation_config, update_model_config -from verl.workers.config import HFModelConfig, RolloutConfig from verl.workers.fsdp_workers import ActorRolloutRefWorker, AsyncActorRolloutRefWorker, CriticWorker -from verl.workers.rollout import get_rollout_class logger = logging.getLogger(__file__) logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 40bf8eefa64..273f8348929 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -282,6 +282,7 @@ async def _init_async_rollout_manager(self): # create async rollout manager and request scheduler assert self.config.actor_rollout_ref.rollout.mode == "async" from recipe.fully_async_policy.agent_loop import PartialAgentLoopManager + self.async_rollout_mode = True print(f"{self.async_rollout_mode}") self.async_rollout_manager = await PartialAgentLoopManager.create( @@ -290,7 +291,6 @@ async def _init_async_rollout_manager(self): ) print(f"self.async_rollout_manager {self.async_rollout_manager}") - # 添加样本到待处理队列的协程 async def _feed_samples(self): continuous_iterator = self._create_continuous_iterator() @@ -604,8 +604,7 @@ async def _should_pause_generation(self) -> bool: return False async def pause(self): - """pause rollout - """ + """pause rollout""" print("[FullyAsyncRollouter][Public][Pause]") async 
with self.lock: self.paused = True diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index f13781c850e..a4c59c33701 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -13,7 +13,6 @@ # limitations under the License. import time -import warnings from datetime import datetime from pprint import pprint from typing import Any @@ -31,11 +30,9 @@ from recipe.fully_async_policy.ray_trainer import FullyAsyncRayPPOTrainer from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup from verl.trainer.ppo import core_algos -from verl.trainer.ppo.core_algos import AdvantageEstimator from verl.trainer.ppo.ray_trainer import ResourcePoolManager -from verl.trainer.ppo.utils import Role, WorkerType, need_reference_policy, need_reward_model, need_critic +from verl.trainer.ppo.utils import Role, WorkerType, need_critic, need_reference_policy, need_reward_model from verl.utils.debug import marked_timer -from verl.utils.tracking import ValidationGenerationsLogger @ray.remote(num_cpus=10) diff --git a/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py b/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py index 2faf6b89c0e..0831aebd5b4 100644 --- a/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py +++ b/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py @@ -25,8 +25,8 @@ from verl.workers.rollout.replica import RolloutMode from verl.workers.rollout.vllm_rollout.vllm_async_server import ( _qwen2_5_vl_dedup_image_tokens, - vLLMHttpServer, - vLLMReplica, vLLMHttpServerBase, + vLLMHttpServerBase, + vLLMReplica, ) logger = logging.getLogger(__file__) diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh index fa7587204f1..096cb05c7a1 100644 --- a/tests/special_e2e/run_fully_async_policy.sh +++ b/tests/special_e2e/run_fully_async_policy.sh @@ -49,8 +49,8 @@ top_k=-1 
val_top_p=0.7 # Fully async specific parameters -n_gpus_rollout=1 -n_gpus_training=1 +n_gpus_rollout=4 +n_gpus_training=4 train_prompt_bsz=0 gen_prompt_bsz=1 diff --git a/verl/experimental/agent_loop/__init__.py b/verl/experimental/agent_loop/__init__.py index 27b633e5055..fd3d2ca1b84 100644 --- a/verl/experimental/agent_loop/__init__.py +++ b/verl/experimental/agent_loop/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .agent_loop import AgentLoopBase, AgentLoopManager, AsyncLLMServerManager, AgentLoopWorker +from .agent_loop import AgentLoopBase, AgentLoopManager, AgentLoopWorker, AsyncLLMServerManager from .single_turn_agent_loop import SingleTurnAgentLoop from .tool_agent_loop import ToolAgentLoop diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py index a6fdda210a0..5183865ee64 100644 --- a/verl/experimental/agent_loop/agent_loop.py +++ b/verl/experimental/agent_loop/agent_loop.py @@ -365,7 +365,7 @@ def __init__( self.config = config # for recipe to change - if not hasattr(self, 'server_manager_class'): + if not hasattr(self, "server_manager_class"): self.server_manager_class = AsyncLLMServerManager(config, server_handles) self.rm_executor = rm_executor @@ -756,9 +756,9 @@ def batch_fn(data_list: list[DataProto]) -> list[torch.Tensor]: self.rm_micro_batch_size = rm_wg.world_size # for recipe to change - if not hasattr(self, 'rollout_replica_class'): + if not hasattr(self, "rollout_replica_class"): self.rollout_replica_class = get_rollout_replica_class(self.config.actor_rollout_ref.rollout.name) - if not hasattr(self, 'agent_loop_workers_class'): + if not hasattr(self, "agent_loop_workers_class"): self.agent_loop_workers_class = AgentLoopWorker self._initialize_llm_servers() @@ -790,7 +790,6 @@ def _initialize_llm_servers(self): self.server_handles = [server._server_handle for server in self.rollout_replicas] 
self.server_addresses = [server._server_address for server in self.rollout_replicas] - def _init_agent_loop_workers(self): self.agent_loop_workers = [] num_workers = self.config.actor_rollout_ref.rollout.agent.num_workers diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py index e996df19247..a3b22765ba4 100644 --- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py +++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py @@ -74,11 +74,11 @@ def _init_executor(self) -> None: self.collective_rpc("load_model") def collective_rpc( - self, - method: str | Callable, - timeout: Optional[float] = None, - args: tuple = (), - kwargs: Optional[dict[str, Any]] = None, + self, + method: str | Callable, + timeout: Optional[float] = None, + args: tuple = (), + kwargs: Optional[dict[str, Any]] = None, ) -> list[Any]: if isinstance(method, str): sent_method = method @@ -107,14 +107,14 @@ class vLLMHttpServerBase: """ def __init__( - self, - config: DictConfig, - rollout_mode: RolloutMode, - workers: list[ActorHandle], - replica_rank: int, - node_rank: int, - gpus_per_node: int, - nnodes: int, + self, + config: DictConfig, + rollout_mode: RolloutMode, + workers: list[ActorHandle], + replica_rank: int, + node_rank: int, + gpus_per_node: int, + nnodes: int, ): """ Args: @@ -244,7 +244,7 @@ async def launch_server(self, master_address: str = None, master_port: int = Non print( "=" * 1000, f"replica_rank={self.replica_rank}, node_rank={self.node_rank}, nnodes={self.nnodes}, " - f"get worker zmq addresses: {zmq_addresses}" + f"get worker zmq addresses: {zmq_addresses}", ) os.environ["VERL_VLLM_ZMQ_ADDRESSES"] = ",".join(zmq_addresses) @@ -273,11 +273,11 @@ async def run_server(self, args: argparse.Namespace): engine_client.shutdown = lambda: None async def generate( - self, - prompt_ids: list[int], - sampling_params: dict[str, Any], - request_id: str, - image_data: Optional[list[Any]] = None, + self, + 
prompt_ids: list[int], + sampling_params: dict[str, Any], + request_id: str, + image_data: Optional[list[Any]] = None, ) -> TokenOutput: """Generate sequence with token-in-token-out.""" # TODO(@wuxibin): switch to `/generate` http endpoint once multi-modal support ready. @@ -333,14 +333,14 @@ class vLLMHttpServer(vLLMHttpServerBase): """ def __init__( - self, - config: DictConfig, - rollout_mode: RolloutMode, - workers: list[ActorHandle], - replica_rank: int, - node_rank: int, - gpus_per_node: int, - nnodes: int, + self, + config: DictConfig, + rollout_mode: RolloutMode, + workers: list[ActorHandle], + replica_rank: int, + node_rank: int, + gpus_per_node: int, + nnodes: int, ): super().__init__(config, rollout_mode, workers, replica_rank, node_rank, gpus_per_node, nnodes) @@ -385,7 +385,7 @@ async def launch_servers(self): # create server actor in each node with node affinity for node_rank in range(nnodes): - workers = self.workers[node_rank * gpus_per_node: (node_rank + 1) * gpus_per_node] + workers = self.workers[node_rank * gpus_per_node : (node_rank + 1) * gpus_per_node] node_id = worker_node_ids[node_rank * gpus_per_node] server = self.server_class.options( scheduling_strategy=ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy( diff --git a/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py b/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py index 5e084509aee..ff301dfb3ac 100644 --- a/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py +++ b/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py @@ -91,10 +91,10 @@ def _pre_process_inputs(pad_token_id, prompt_token_ids: torch.Tensor) -> list[in class vLLMRollout(BaseRollout): def __init__( - self, - config: RolloutConfig, - model_config: HFModelConfig, - device_mesh: DeviceMesh, + self, + config: RolloutConfig, + model_config: HFModelConfig, + device_mesh: DeviceMesh, ): super().__init__(config, model_config, device_mesh) @@ -125,11 +125,11 @@ def __init__( if hasattr(model_hf_config, 
"max_position_embeddings"): max_position_embeddings = model_hf_config.max_position_embeddings elif hasattr(model_hf_config, "llm_config") and hasattr( - model_hf_config.llm_config, "max_position_embeddings" + model_hf_config.llm_config, "max_position_embeddings" ): max_position_embeddings = model_hf_config.llm_config.max_position_embeddings elif hasattr(model_hf_config, "text_config") and hasattr( - model_hf_config.text_config, "max_position_embeddings" + model_hf_config.text_config, "max_position_embeddings" ): max_position_embeddings = model_hf_config.text_config.max_position_embeddings if max_position_embeddings is None: @@ -144,12 +144,12 @@ def __init__( rope_scaling_factor = rope_scaling_config.get("factor", 1.0) assert ( - model_hf_config.max_position_embeddings * rope_scaling_factor - >= config.prompt_length + config.response_length + model_hf_config.max_position_embeddings * rope_scaling_factor + >= config.prompt_length + config.response_length ), ( - "model context length should be greater than total sequence length, " - + f"got rope_scaling_factor={rope_scaling_factor} and " - + f"max_position_embeddings={model_hf_config.max_position_embeddings}" + "model context length should be greater than total sequence length, " + + f"got rope_scaling_factor={rope_scaling_factor} and " + + f"max_position_embeddings={model_hf_config.max_position_embeddings}" ) max_model_len = int(config.max_model_len or config.prompt_length + config.response_length) @@ -289,7 +289,7 @@ def generate_sequences(self, prompts: DataProto, **kwargs) -> DataProto: if "multi_modal_data" in non_tensor_batch: vllm_inputs = [] for raw_prompt_ids, multi_modal_data in zip( - non_tensor_batch.pop("raw_prompt_ids"), non_tensor_batch.pop("multi_modal_data"), strict=True + non_tensor_batch.pop("raw_prompt_ids"), non_tensor_batch.pop("multi_modal_data"), strict=True ): vllm_inputs.append({"prompt_token_ids": raw_prompt_ids, "multi_modal_data": multi_modal_data}) else: @@ -332,9 +332,8 @@ def 
generate_sequences(self, prompts: DataProto, **kwargs) -> DataProto: if len(lora_int_ids) > 0: lora_int_id = lora_int_ids[0] lora_requests = [ - LoRARequest(lora_name=f"{lora_int_id}", lora_int_id=lora_int_id, - lora_path="/simon-stub-path") - ] * batch_size + LoRARequest(lora_name=f"{lora_int_id}", lora_int_id=lora_int_id, lora_path="/simon-stub-path") + ] * batch_size # users can customize different sampling_params at different run with self.update_sampling_params(**kwargs): @@ -459,9 +458,9 @@ def _monkey_patch_compute_logits(model, vocab_size: int): original_compute_logits = model.compute_logits def compute_logits( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, ) -> torch.Tensor: logits = original_compute_logits(hidden_states, sampling_metadata) logits[..., vocab_size:] = float("-inf") @@ -474,10 +473,10 @@ class vLLMAsyncRollout(BaseRollout): """vLLMAsyncRollout is a thin wrapper of WorkerWrapperBase, which is engine in single worker process.""" def __init__( - self, - config: RolloutConfig, - model_config: HFModelConfig, - device_mesh: DeviceMesh, + self, + config: RolloutConfig, + model_config: HFModelConfig, + device_mesh: DeviceMesh, ): super().__init__(config, model_config, device_mesh) @@ -535,10 +534,7 @@ async def _loop_forever(self): def _init_worker(self, all_kwargs: list[dict[str, Any]]): """Initialize worker engine.""" - print("=" * 100, "\n", - "=" * 100, "\n", - "=" * 100, "\n", - "Initializing vLLMAsyncRollout...") + print("=" * 100, "\n", "=" * 100, "\n", "=" * 100, "\n", "Initializing vLLMAsyncRollout...") all_kwargs[0]["rank"] = int(os.environ["RANK"]) device_name = "NPU" if is_npu_available else "GPU" From a48ec884b82fb01161b21803b6e847318b9c835f Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Thu, 18 Sep 2025 20:49:43 +0800 Subject: [PATCH 153/182] refactor 7 --- examples/grpo_trainer/run_qwen2-7b_seq_balance.sh | 6 +++--- 
recipe/fully_async_policy/agent_loop/__init__.py | 5 ++--- recipe/fully_async_policy/agent_loop/agent_loop.py | 4 ++-- recipe/fully_async_policy/fully_async_rollouter.py | 3 --- verl/experimental/agent_loop/agent_loop.py | 6 +++--- verl/workers/rollout/vllm_rollout/vllm_async_server.py | 5 ++--- verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py | 3 --- 7 files changed, 12 insertions(+), 20 deletions(-) diff --git a/examples/grpo_trainer/run_qwen2-7b_seq_balance.sh b/examples/grpo_trainer/run_qwen2-7b_seq_balance.sh index f4ca9a41d7e..fdc1ef606d7 100644 --- a/examples/grpo_trainer/run_qwen2-7b_seq_balance.sh +++ b/examples/grpo_trainer/run_qwen2-7b_seq_balance.sh @@ -3,7 +3,7 @@ set -x # For async rollout mode, dataset should return raw chat. rollout_mode="async" -rollout_name="vllm" # sglang or vllm +rollout_name="sglang" # sglang or vllm if [ "$rollout_mode" = "async" ]; then export VLLM_USE_V1=1 return_raw_chat="True" @@ -19,7 +19,7 @@ python3 -m verl.trainer.main_ppo \ data.max_response_length=1024 \ data.filter_overlong_prompts=True \ data.truncation='error' \ - actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B-Instruct \ + actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \ actor_rollout_ref.actor.optim.lr=1e-6 \ actor_rollout_ref.model.use_remove_padding=True \ actor_rollout_ref.actor.ppo_mini_batch_size=256 \ @@ -41,7 +41,7 @@ python3 -m verl.trainer.main_ppo \ actor_rollout_ref.ref.fsdp_config.param_offload=True \ algorithm.use_kl_in_reward=False \ trainer.critic_warmup=0 \ - trainer.logger='["console","tensorboard"]' \ + trainer.logger='["console","wandb"]' \ trainer.project_name='verl_grpo_example_gsm8k' \ trainer.experiment_name='qwen2_7b_function_rm_kl1e-3' \ trainer.val_before_train=False \ diff --git a/recipe/fully_async_policy/agent_loop/__init__.py b/recipe/fully_async_policy/agent_loop/__init__.py index 40dcd0ac7a3..773dab10572 100644 --- a/recipe/fully_async_policy/agent_loop/__init__.py +++ b/recipe/fully_async_policy/agent_loop/__init__.py 
@@ -13,8 +13,7 @@ # limitations under the License. from .partial_single_turn_agent_loop import PartialSingleTurnAgentLoop +from .agent_loop import PartialAgentLoopManager _ = [PartialSingleTurnAgentLoop] - - -from .agent_loop import PartialAgentLoopManager +__all__ = ["PartialAgentLoopManager"] diff --git a/recipe/fully_async_policy/agent_loop/agent_loop.py b/recipe/fully_async_policy/agent_loop/agent_loop.py index d9cb2c9187e..3fce8d65e60 100644 --- a/recipe/fully_async_policy/agent_loop/agent_loop.py +++ b/recipe/fully_async_policy/agent_loop/agent_loop.py @@ -72,7 +72,7 @@ class FullyAsyncAgentLoopWorker(AgentLoopWorkerBase): def __init__( self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], rm_executor: BatchExecutor = None ): - self.server_manager_class = FullyAsyncLLMServerManager(config, server_handles) + self.server_manager = FullyAsyncLLMServerManager(config, server_handles) super().__init__(config, server_handles, rm_executor) async def generate_sequences_no_post( @@ -148,7 +148,7 @@ async def _partial_run_agent_loop( agent_loop = hydra.utils.instantiate( config=agent_loop_config, trainer_config=_DummyConfig(config=self.config), - server_manager=self.server_manager_class, + server_manager=self.server_manager, tokenizer=self.tokenizer, processor=self.processor, ) diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 273f8348929..ca5e312a1a1 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -282,14 +282,11 @@ async def _init_async_rollout_manager(self): # create async rollout manager and request scheduler assert self.config.actor_rollout_ref.rollout.mode == "async" from recipe.fully_async_policy.agent_loop import PartialAgentLoopManager - self.async_rollout_mode = True - print(f"{self.async_rollout_mode}") self.async_rollout_manager = await PartialAgentLoopManager.create( config=self.config, 
worker_group=self.rollout_wg, ) - print(f"self.async_rollout_manager {self.async_rollout_manager}") # 添加样本到待处理队列的协程 async def _feed_samples(self): diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py index 5183865ee64..ae56c2a187a 100644 --- a/verl/experimental/agent_loop/agent_loop.py +++ b/verl/experimental/agent_loop/agent_loop.py @@ -365,8 +365,8 @@ def __init__( self.config = config # for recipe to change - if not hasattr(self, "server_manager_class"): - self.server_manager_class = AsyncLLMServerManager(config, server_handles) + if not hasattr(self, "server_manager"): + self.server_manager = AsyncLLMServerManager(config, server_handles) self.rm_executor = rm_executor @@ -480,7 +480,7 @@ async def _run_agent_loop( agent_loop = hydra.utils.instantiate( config=agent_loop_config, trainer_config=_DummyConfig(config=self.config), - server_manager=self.server_manager_class, + server_manager=self.server_manager, tokenizer=self.tokenizer, processor=self.processor, ) diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py index a3b22765ba4..75195009057 100644 --- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py +++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py @@ -241,10 +241,9 @@ async def launch_server(self, master_address: str = None, master_port: int = Non server_args.distributed_executor_backend = distributed_executor_backend zmq_addresses = ray.get([worker.get_zeromq_address.remote() for worker in self.workers]) - print( - "=" * 1000, + logger.info( f"replica_rank={self.replica_rank}, node_rank={self.node_rank}, nnodes={self.nnodes}, " - f"get worker zmq addresses: {zmq_addresses}", + f"get worker zmq addresses: {zmq_addresses}" ) os.environ["VERL_VLLM_ZMQ_ADDRESSES"] = ",".join(zmq_addresses) diff --git a/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py b/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py index 
ff301dfb3ac..baef0c9315e 100644 --- a/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py +++ b/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py @@ -68,7 +68,6 @@ logger = logging.getLogger(__file__) logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) - # TODO # 1. support pp in vllm # 2. passing tokenizer is not necessary? no encoding/decoding is happending here @@ -534,8 +533,6 @@ async def _loop_forever(self): def _init_worker(self, all_kwargs: list[dict[str, Any]]): """Initialize worker engine.""" - print("=" * 100, "\n", "=" * 100, "\n", "=" * 100, "\n", "Initializing vLLMAsyncRollout...") - all_kwargs[0]["rank"] = int(os.environ["RANK"]) device_name = "NPU" if is_npu_available else "GPU" all_kwargs[0]["local_rank"] = ( From e6819cdb180f516253386a45807066fc2c12aa52 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Thu, 18 Sep 2025 21:06:42 +0800 Subject: [PATCH 154/182] refactor 8 --- .../fully_async_policy/agent_loop/__init__.py | 4 +- .../agent_loop/agent_loop.py | 14 +- recipe/fully_async_policy/fsdp_workers.py | 2 +- .../fully_async_rollouter.py | 10 +- recipe/fully_async_policy/ray_trainer.py | 16 +- .../unittest/test_batch_utils.py | 344 ---------------- recipe/fully_async_policy/unittest/test_mq.py | 387 ------------------ .../vllm_rollout/vllm_async_server.py | 8 + verl/experimental/agent_loop/__init__.py | 2 +- 9 files changed, 25 insertions(+), 762 deletions(-) delete mode 100644 recipe/fully_async_policy/unittest/test_batch_utils.py delete mode 100644 recipe/fully_async_policy/unittest/test_mq.py diff --git a/recipe/fully_async_policy/agent_loop/__init__.py b/recipe/fully_async_policy/agent_loop/__init__.py index 773dab10572..e30d78f1a8a 100644 --- a/recipe/fully_async_policy/agent_loop/__init__.py +++ b/recipe/fully_async_policy/agent_loop/__init__.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from .agent_loop import FullyAsyncAgentLoopManager from .partial_single_turn_agent_loop import PartialSingleTurnAgentLoop -from .agent_loop import PartialAgentLoopManager _ = [PartialSingleTurnAgentLoop] -__all__ = ["PartialAgentLoopManager"] +__all__ = [FullyAsyncAgentLoopManager] diff --git a/recipe/fully_async_policy/agent_loop/agent_loop.py b/recipe/fully_async_policy/agent_loop/agent_loop.py index 3fce8d65e60..2ccfa712c54 100644 --- a/recipe/fully_async_policy/agent_loop/agent_loop.py +++ b/recipe/fully_async_policy/agent_loop/agent_loop.py @@ -155,9 +155,8 @@ async def _partial_run_agent_loop( return await agent_loop.run(sampling_params, **kwargs) -class PartialAgentLoopManager(AgentLoopManager): +class FullyAsyncAgentLoopManager(AgentLoopManager): def __init__(self, config: DictConfig, worker_group: RayWorkerGroup = None, rm_wg: RayWorkerGroup = None): - # 初始化基本属性,但不执行异步操作 self.config = config self.worker_group = worker_group self.rm_executor = None @@ -165,7 +164,6 @@ def __init__(self, config: DictConfig, worker_group: RayWorkerGroup = None, rm_w self.agent_loop_workers_class = FullyAsyncAgentLoopWorker self.rollout_replica_class = FullyAsyncvLLMReplica - # 初始化其他必要属性为None,稍后在异步初始化中设置 self.rm_wg = rm_wg self.rollout_replicas = None self.server_handles = None @@ -174,14 +172,11 @@ def __init__(self, config: DictConfig, worker_group: RayWorkerGroup = None, rm_w @classmethod async def create(cls, config: DictConfig, worker_group: RayWorkerGroup = None, rm_wg: RayWorkerGroup = None): - """异步工厂方法来创建和初始化 PartialAgentLoopManager 实例""" instance = cls(config, worker_group, rm_wg) await instance._async_init() return instance async def _async_init(self): - """异步初始化方法""" - # 处理 rm_wg 相关初始化 if self.rm_wg: def batch_fn(data_list: list[DataProto]) -> list[torch.Tensor]: @@ -208,7 +203,6 @@ def batch_fn(data_list: list[DataProto]) -> list[torch.Tensor]: self._init_agent_loop_workers() async def _initialize_llm_servers_async(self): - """异步初始化 LLM 服务器""" rollout_world_size 
= self.config.actor_rollout_ref.rollout.tensor_model_parallel_size world_size = ( self.worker_group.world_size @@ -239,11 +233,9 @@ async def generate_single_sample_async( ) -> list[AgentLoopOutput]: """ 异步处理单个样本 - Args: sample: 单个样本数据 partial_output_list: Optional[List[AgentLoopOutput]]: 已经 rollout 的结果 - Returns: list[AgentLoopOutput]: 处理结果列表 """ @@ -252,7 +244,6 @@ async def generate_single_sample_async( return await asyncio.wrap_future(output_future.future()) def _select_best_worker(self): - """选择最佳的 worker(简单的轮询负载均衡)""" if not hasattr(self, "_worker_index"): self._worker_index = 0 @@ -271,3 +262,6 @@ async def wake_up(self): async def sleep(self): await asyncio.gather(*[replica.sleep() for replica in self.rollout_replicas]) + + async def reset_prefix_cache(self): + await asyncio.gather(*[replica.reset_prefix_cache() for replica in self.rollout_replicas]) diff --git a/recipe/fully_async_policy/fsdp_workers.py b/recipe/fully_async_policy/fsdp_workers.py index 8471897fc83..ffe50941187 100644 --- a/recipe/fully_async_policy/fsdp_workers.py +++ b/recipe/fully_async_policy/fsdp_workers.py @@ -36,7 +36,7 @@ device_name = get_device_name() -__all__ = ["DetachActorWorker", "DetachRolloutWorker", "DetachAsyncRolloutWorker", "CriticWorker"] +__all__ = ["DetachActorWorker", "DetachAsyncRolloutWorker", "CriticWorker"] def get_inference_model(rollout): diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index ca5e312a1a1..d10c0684be4 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -281,9 +281,10 @@ def _create_continuous_iterator(self): async def _init_async_rollout_manager(self): # create async rollout manager and request scheduler assert self.config.actor_rollout_ref.rollout.mode == "async" - from recipe.fully_async_policy.agent_loop import PartialAgentLoopManager + from recipe.fully_async_policy.agent_loop import 
FullyAsyncAgentLoopManager + self.async_rollout_mode = True - self.async_rollout_manager = await PartialAgentLoopManager.create( + self.async_rollout_manager = await FullyAsyncAgentLoopManager.create( config=self.config, worker_group=self.rollout_wg, ) @@ -405,6 +406,9 @@ async def _processor_worker(self): async def _process_single_sample_streaming(self, rollout_sample: RolloutSample): """流式处理单个样本""" # 调用异步生成方法 + rollout_sample.full_batch.non_tensor_batch["param_version"] = [self.current_param_version] * len( + rollout_sample.full_batch + ) agent_loop_output_list = await self.async_rollout_manager.generate_single_sample_async( rollout_sample.full_batch, rollout_sample.agent_loop_output_list ) @@ -612,7 +616,7 @@ async def pause(self): await asyncio.gather(*self.active_tasks, return_exceptions=True) self.active_tasks.clear() print("[FullyAsyncRollouter][Public][Pause] All active tasks completed") - # TODO async_rollout_manager clear kv cache + await self.async_rollout_manager.reset_prefix_cache() self.monitor_loop_trigger = False async def resume(self): diff --git a/recipe/fully_async_policy/ray_trainer.py b/recipe/fully_async_policy/ray_trainer.py index 86992de4c13..b82d9fe0aae 100644 --- a/recipe/fully_async_policy/ray_trainer.py +++ b/recipe/fully_async_policy/ray_trainer.py @@ -38,7 +38,7 @@ compute_throughout_metrics, compute_timing_metrics, ) -from verl.trainer.ppo.ray_trainer import RayPPOTrainer, compute_advantage +from verl.trainer.ppo.ray_trainer import RayPPOTrainer, apply_kl_penalty, compute_advantage, compute_response_mask from verl.trainer.ppo.reward import compute_reward, compute_reward_async from verl.trainer.ppo.utils import Role from verl.utils.checkpoint.checkpoint_manager import should_save_ckpt_esi @@ -51,9 +51,6 @@ class FullyAsyncRayPPOTrainer(RayPPOTrainer): - def __init__(self, *args, **kwargs): - pass - def init_workers(self): """Initialize distributed training workers using Ray backend. 
@@ -161,16 +158,7 @@ def _init_models(self): self.actor_rollout_wg.init_model() def _init_async_rollout_manager(self): - # create async rollout manager and request scheduler - self.async_rollout_mode = False - if self.config.actor_rollout_ref.rollout.mode == "async": - from recipe.fully_async_policy.agent_loop.agent_loop import PartialAgentLoopManager - - self.async_rollout_mode = True - self.async_rollout_manager = PartialAgentLoopManager( - config=self.config, - worker_group=self.actor_rollout_wg, - ) + pass def fit(self): """ diff --git a/recipe/fully_async_policy/unittest/test_batch_utils.py b/recipe/fully_async_policy/unittest/test_batch_utils.py deleted file mode 100644 index 363423b589d..00000000000 --- a/recipe/fully_async_policy/unittest/test_batch_utils.py +++ /dev/null @@ -1,344 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2025 Meituan Ltd. and/or its affiliates -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import sys -import time -import unittest -from dataclasses import dataclass -from unittest.mock import MagicMock - -import numpy as np -import torch -from tensordict import TensorDict - -sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..", "..")) - -from recipe.fully_async_policy.detach_utils import RolloutSample, assemble_batch_from_rollout_samples -from verl import DataProto - - -@dataclass -class MockAgentLoopMetrics: - """Mock AgentLoopMetrics for testing""" - - generate_sequences: float = 0.5 - tool_calls: float = 0.0 - - -@dataclass -class MockAgentLoopOutput: - """Mock AgentLoopOutput for testing""" - - prompt_ids: list[int] - response_ids: list[int] - response_mask: list[int] - num_turns: int = 1 - metrics: MockAgentLoopMetrics = None - - def __post_init__(self): - if self.metrics is None: - self.metrics = MockAgentLoopMetrics() - - -class MockConfig: - """Mock configuration object""" - - def __init__(self): - self.trainer = MockTrainerConfig() - - -class MockTrainerConfig: - """Mock trainer configuration""" - - def __init__(self): - self.balance_batch = False - - -class TestBatchUtils(unittest.TestCase): - def setUp(self): - """设置测试环境""" - self.tokenizer = MagicMock() - self.config = MockConfig() - - # Mock postprocess_agent_loop_outputs function - self.mock_postprocess = MagicMock() - - # Patch the postprocess function - import recipe.fully_async_policy.detach_utils as detach_utils_module - - self.original_postprocess = detach_utils_module.postprocess_agent_loop_outputs - detach_utils_module.postprocess_agent_loop_outputs = self.mock_postprocess - - # Mock compute_response_mask function - self.original_compute_response_mask = detach_utils_module.compute_response_mask - detach_utils_module.compute_response_mask = MagicMock(return_value=torch.ones(2, 128, dtype=torch.int64)) - - def tearDown(self): - """清理测试环境""" - import recipe.fully_async_policy.detach_utils as detach_utils_module - - 
detach_utils_module.postprocess_agent_loop_outputs = self.original_postprocess - detach_utils_module.compute_response_mask = self.original_compute_response_mask - - def create_mock_rollout_sample(self, sample_id: str, param_version: int = 1) -> RolloutSample: - """创建测试用的 RolloutSample""" - # 创建 mock AgentLoopOutput - agent_loop_output = MockAgentLoopOutput( - prompt_ids=torch.randint(0, 32000, (175,)).tolist(), - response_ids=torch.randint(0, 32000, (175,)).tolist(), - response_mask=[1] * 175, # 真实的response长度 - num_turns=2, - metrics=MockAgentLoopMetrics(generate_sequences=1.6468379497528076, tool_calls=0.0), - ) - - # 创建mock _gen_data - mock_gen_data = DataProto( - non_tensor_batch={ - "raw_prompt": np.array( - [ - [ - { - "content": "Tom receives a $12 allowance per month.", - "role": "user", - } - ] - ], - dtype=object, - ), - "tools_kwargs": np.array([{}], dtype=object), - "interaction_kwargs": np.array([{}], dtype=object), - "index": np.array([4570], dtype=object), - }, - meta_info={"global_steps": 1}, - ) - - return RolloutSample( - full_batch=mock_gen_data, - agent_loop_output_list=agent_loop_output, - sample_id=sample_id, - epoch=0, - rollout_n_index=0, - original_sample_index=0, - processing_time=1.6468379497528076, - generation_timestamp=time.time(), - param_version=param_version, - ) - - # def test_assemble_batch_empty_input(self): - # """测试空输入的情况""" - # with self.assertRaises(ValueError) as context: - # assemble_batch_from_rollout_samples([], self.tokenizer, self.config) - # - # self.assertIn("Empty rollout_samples", str(context.exception)) - # - # def test_assemble_batch_single_sample(self): - # """测试单个样本的批次组装""" - # # 设置mock返回值 - 使用正确的TensorDict格式 - # mock_gen_batch = DataProto( - # batch=TensorDict({ - # "input_ids": torch.randint(0, 1000, (1, 256)), - # "attention_mask": torch.ones(1, 256, dtype=torch.int64), - # "position_ids": torch.arange(256).unsqueeze(0), - # "prompts": torch.randint(0, 1000, (1, 128)), - # "responses": torch.randint(0, 1000, 
(1, 128)), - # "response_mask": torch.ones(1, 128, dtype=torch.int64), - # }, batch_size=1), - # non_tensor_batch={"__test_key": np.array(["test_value"], dtype=object)}, - # meta_info={"test_meta": "test_value"} - # ) - # self.mock_postprocess.return_value = mock_gen_batch - # - # # 创建测试样本 - # rollout_samples = [self.create_mock_rollout_sample("sample_1")] - # - # # 调用函数 - # result = assemble_batch_from_rollout_samples( - # rollout_samples=rollout_samples, - # tokenizer=self.tokenizer, - # config=self.config - # ) - # - # # 验证结果 - # self.assertIsInstance(result, DataProto) - # self.assertIn("uid", result.non_tensor_batch) - # self.assertEqual(result.non_tensor_batch["uid"][0], "uid_sample_1") - # - # # 验证meta_info包含预期字段 - # expected_fields = [ - # "rollout_param_versions", "sample_timestamps", "avg_processing_time", - # "max_processing_time", "param_version_diversity", "avg_sample_age", "assembly_time" - # ] - # for field in expected_fields: - # self.assertIn(field, result.meta_info) - # - # # 验证统计信息 - # self.assertEqual(result.meta_info["rollout_param_versions"], [1]) - # self.assertAlmostEqual(result.meta_info["avg_processing_time"], 1.6468379497528076, places=5) - # self.assertEqual(result.meta_info["param_version_diversity"], 1) - - def test_assemble_batch_multiple_samples(self): - """测试多个样本的批次组装""" - # 设置mock返回值 - 使用正确的TensorDict格式 - mock_gen_batch = DataProto( - batch=TensorDict( - { - "input_ids": torch.randint(0, 1000, (2, 256)), - "attention_mask": torch.ones(2, 256, dtype=torch.int64), - "position_ids": torch.arange(256).unsqueeze(0).repeat(2, 1), - "prompts": torch.randint(0, 1000, (2, 128)), - "responses": torch.randint(0, 1000, (2, 128)), - "response_mask": torch.ones(2, 128, dtype=torch.int64), - }, - batch_size=2, - ), - non_tensor_batch={"__test_key": np.array(["test_value1", "test_value2"], dtype=object)}, - meta_info={"test_meta": "test_value"}, - ) - self.mock_postprocess.return_value = mock_gen_batch - - # 创建测试样本 - rollout_samples = [ - 
self.create_mock_rollout_sample("sample_1", param_version=1), - self.create_mock_rollout_sample("sample_2", param_version=2), - ] - - print(rollout_samples) - - # 调用函数 - result = assemble_batch_from_rollout_samples( - rollout_samples=rollout_samples, tokenizer=self.tokenizer, config=self.config - ) - - # 验证结果 - self.assertIsInstance(result, DataProto) - self.assertEqual(len(result.non_tensor_batch["uid"]), 2) - self.assertListEqual(list(result.non_tensor_batch["uid"]), ["uid_sample_1", "uid_sample_2"]) - - # 验证多样本统计 - self.assertEqual(result.meta_info["rollout_param_versions"], [1, 2]) - self.assertEqual(result.meta_info["param_version_diversity"], 2) # 两个不同版本 - self.assertAlmostEqual(result.meta_info["avg_processing_time"], 1.6468379497528076, places=5) - - # def test_assemble_batch_with_balance_batch_flag(self): - # """测试启用balance_batch标志的情况""" - # # 设置mock返回值 - 使用正确的TensorDict格式 - # mock_gen_batch = DataProto( - # batch=TensorDict({ - # "input_ids": torch.randint(0, 1000, (1, 256)), - # "attention_mask": torch.ones(1, 256, dtype=torch.int64), - # "position_ids": torch.arange(256).unsqueeze(0), - # "prompts": torch.randint(0, 1000, (1, 128)), - # "responses": torch.randint(0, 1000, (1, 128)), - # "response_mask": torch.ones(1, 128, dtype=torch.int64), - # }, batch_size=1), - # non_tensor_batch={"__test_key": np.array(["test_value"], dtype=object)}, - # meta_info={"test_meta": "test_value"} - # ) - # self.mock_postprocess.return_value = mock_gen_batch - # - # # 设置config启用balance_batch - # self.config.trainer.balance_batch = True - # - # # 创建测试样本 - # rollout_samples = [self.create_mock_rollout_sample("sample_1")] - # - # # 调用函数 - # result = assemble_batch_from_rollout_samples( - # rollout_samples=rollout_samples, - # tokenizer=self.tokenizer, - # config=self.config, - # balance_batch=True - # ) - # - # # 验证结果(主要验证没有抛出异常) - # self.assertIsInstance(result, DataProto) - # - # def test_assemble_batch_attention_mask_processing(self): - # """测试attention_mask处理逻辑""" - # # 
设置mock返回值 - 使用正确的TensorDict格式 - # mock_gen_batch = DataProto( - # batch=TensorDict({ - # "input_ids": torch.randint(0, 1000, (2, 256)), - # "attention_mask": torch.ones(2, 256, dtype=torch.int64), - # "position_ids": torch.arange(256).unsqueeze(0).repeat(2, 1), - # "prompts": torch.randint(0, 1000, (2, 128)), - # "responses": torch.randint(0, 1000, (2, 128)), - # "response_mask": torch.ones(2, 128, dtype=torch.int64), - # }, batch_size=2), - # non_tensor_batch={"__test_key": np.array(["test_value1", "test_value2"], dtype=object)}, - # meta_info={"test_meta": "test_value"} - # ) - # self.mock_postprocess.return_value = mock_gen_batch - # - # # 创建测试样本 - # rollout_samples = [ - # self.create_mock_rollout_sample("sample_1"), - # self.create_mock_rollout_sample("sample_2"), - # ] - # - # # 调用函数 - # result = assemble_batch_from_rollout_samples( - # rollout_samples=rollout_samples, - # tokenizer=self.tokenizer, - # config=self.config - # ) - # - # # 验证global_token_num被正确计算 - # self.assertIn("global_token_num", result.meta_info) - # self.assertIsInstance(result.meta_info["global_token_num"], list) - # - # def test_mock_postprocess_called_correctly(self): - # """测试postprocess_agent_loop_outputs被正确调用""" - # # 设置mock返回值 - 使用正确的TensorDict格式 - # mock_gen_batch = DataProto( - # batch=TensorDict({ - # "input_ids": torch.randint(0, 1000, (1, 256)), - # "attention_mask": torch.ones(1, 256, dtype=torch.int64), - # "position_ids": torch.arange(256).unsqueeze(0), - # "prompts": torch.randint(0, 1000, (1, 128)), - # "responses": torch.randint(0, 1000, (1, 128)), - # "response_mask": torch.ones(1, 128, dtype=torch.int64), - # }, batch_size=1), - # non_tensor_batch={"__test_key": np.array(["test_value"], dtype=object)}, - # meta_info={"test_meta": "test_value"} - # ) - # self.mock_postprocess.return_value = mock_gen_batch - # - # # 创建测试样本 - # rollout_samples = [self.create_mock_rollout_sample("sample_1")] - # - # # 调用函数 - # result = assemble_batch_from_rollout_samples( - # 
rollout_samples=rollout_samples, - # tokenizer=self.tokenizer, - # config=self.config - # ) - # - # # 验证postprocess_agent_loop_outputs被调用 - # self.mock_postprocess.assert_called_once() - # call_args = self.mock_postprocess.call_args - # - # # 验证调用参数 - # agent_loop_outputs, tokenizer, config = call_args[0] - # self.assertEqual(len(agent_loop_outputs), 1) - # self.assertEqual(tokenizer, self.tokenizer) - # self.assertEqual(config, self.config) - # - - -if __name__ == "__main__": - unittest.main() diff --git a/recipe/fully_async_policy/unittest/test_mq.py b/recipe/fully_async_policy/unittest/test_mq.py deleted file mode 100644 index 7af4945f311..00000000000 --- a/recipe/fully_async_policy/unittest/test_mq.py +++ /dev/null @@ -1,387 +0,0 @@ -# Copyright 2025 Meituan Ltd. and/or its affiliates -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import threading -import time -from unittest.mock import Mock - -import pytest -import ray -from omegaconf import DictConfig - -from recipe.fully_async_policy.message_queue import MessageQueue, MessageQueueClient - - -@pytest.fixture -def mock_sample(): - """Mock sample data object""" - return Mock() - - -@pytest.fixture -def basic_config(): - """Basic configuration""" - return DictConfig({"async_training": {"staleness_threshold": 3}}) - - -@pytest.fixture -def queue_config(): - """Queue configuration with different staleness threshold""" - return DictConfig({"async_training": {"staleness_threshold": 2}}) - - -@pytest.fixture -def ray_setup(): - """Setup Ray environment""" - if not ray.is_initialized(): - ray.init(local_mode=True, ignore_reinit_error=True) - yield - ray.shutdown() - - -@pytest.fixture -def message_queue_client(ray_setup, basic_config): - """Create MessageQueue actor and return its client""" - actor = MessageQueue.remote(basic_config, max_queue_size=10) - client = MessageQueueClient(actor) - yield client - client.shutdown() - - -class TestMessageQueue: - """Test MessageQueue (through MessageQueueClient)""" - - def test_put_sample_success(self, message_queue_client, mock_sample): - """Test successfully putting a sample""" - result = message_queue_client.put_sample(sample=mock_sample, param_version=1) - assert result is True - - # Check queue size - queue_size = message_queue_client.get_queue_size() - assert queue_size == 1 - - # Check statistics - stats = message_queue_client.get_statistics() - assert stats["total_produced"] == 1 - assert stats["queue_size"] == 1 - - def test_put_multiple_samples(self, message_queue_client, mock_sample): - """Test putting multiple samples""" - for i in range(3): - result = message_queue_client.put_sample(sample=mock_sample, param_version=1) - assert result is True - - # Check queue size - queue_size = message_queue_client.get_queue_size() - assert queue_size == 3 - - # Check statistics - stats = 
message_queue_client.get_statistics() - assert stats["total_produced"] == 3 - assert stats["queue_size"] == 3 - - def test_put_sample_staleness_check(self, message_queue_client, mock_sample): - """Test freshness check when putting samples""" - # Update parameter version to 5 - message_queue_client.update_param_version(5) - - # Try to put a stale sample (version difference >= 3 will be rejected) - result = message_queue_client.put_sample( - sample=mock_sample, - param_version=2, # 5-2=3, reaches threshold - ) - - assert result is False - - # Check dropped samples count in statistics - stats = message_queue_client.get_statistics() - assert stats["dropped_samples"] == 1 - - def test_put_sample_queue_overflow(self, message_queue_client, mock_sample): - """Test queue overflow handling""" - # Fill the queue (max capacity 10) - for i in range(12): # Put 12 samples, exceeding max capacity 10 - message_queue_client.put_sample(sample=mock_sample, param_version=1) - - # Queue size should stay at maximum value - queue_size = message_queue_client.get_queue_size() - assert queue_size == 10 - - # Check statistics - stats = message_queue_client.get_statistics() - assert stats["dropped_samples"] == 2 # 2 samples should be dropped - - def test_get_samples_success(self, message_queue_client, mock_sample): - """Test successfully getting samples""" - # First put some samples - for i in range(3): - message_queue_client.put_sample(sample=mock_sample, param_version=1) - - # Get 2 samples - retrieved_samples = message_queue_client.get_samples(min_batch_count=2) - - assert retrieved_samples is not None - assert len(retrieved_samples) == 2 - - # Check queue size decreased - queue_size = message_queue_client.get_queue_size() - assert queue_size == 1 - - # Check statistics - stats = message_queue_client.get_statistics() - assert stats["total_consumed"] == 2 - - def test_get_samples_blocking_behavior(self, message_queue_client, mock_sample): - """Test blocking behavior""" - result = [] - - def 
get_samples(): - # This will block until enough samples are available - samples = message_queue_client.get_samples(min_batch_count=2) - result.append(samples) - - def put_samples_later(): - time.sleep(0.5) # Delay putting samples - message_queue_client.put_sample(sample=mock_sample, param_version=1) - message_queue_client.put_sample(sample=mock_sample, param_version=1) - - # Start consumer thread - consumer_thread = threading.Thread(target=get_samples) - producer_thread = threading.Thread(target=put_samples_later) - - consumer_thread.start() - producer_thread.start() - - # Wait for both threads to complete - producer_thread.join(timeout=2) - consumer_thread.join(timeout=2) - - assert len(result) == 1 - assert len(result[0]) == 2 - - def test_update_param_version(self, message_queue_client): - """Test updating parameter version""" - message_queue_client.update_param_version(10) - stats = message_queue_client.get_statistics() - assert stats["current_param_version"] == 10 - - def test_clear_queue(self, message_queue_client, mock_sample): - """Test clearing the queue""" - # First add some samples - for i in range(3): - message_queue_client.put_sample(sample=mock_sample, param_version=1) - - # Clear the queue - message_queue_client.clear_queue() - - # Check queue size - queue_size = message_queue_client.get_queue_size() - assert queue_size == 0 - - def test_get_queue_size(self, message_queue_client, mock_sample): - """Test getting queue size""" - assert message_queue_client.get_queue_size() == 0 - - message_queue_client.put_sample(sample=mock_sample, param_version=1) - assert message_queue_client.get_queue_size() == 1 - - def test_get_statistics(self, message_queue_client): - """Test getting statistics""" - stats = message_queue_client.get_statistics() - - expected_keys = { - "queue_size", - "total_produced", - "total_consumed", - "dropped_samples", - "current_param_version", - "staleness_threshold", - "max_queue_size", - } - assert set(stats.keys()) == expected_keys - 
assert isinstance(stats["queue_size"], int) - assert isinstance(stats["total_produced"], int) - assert isinstance(stats["total_consumed"], int) - - def test_get_memory_usage(self, message_queue_client, mock_sample): - """Test getting memory usage statistics""" - # Add some samples - for i in range(2): - message_queue_client.put_sample(sample=mock_sample, param_version=1) - - memory_stats = message_queue_client.get_memory_usage() - - expected_keys = {"queue_samples", "estimated_memory_bytes", "estimated_memory_mb"} - assert set(memory_stats.keys()) == expected_keys - assert memory_stats["queue_samples"] == 2 - assert memory_stats["estimated_memory_bytes"] > 0 - assert memory_stats["estimated_memory_mb"] > 0 - - def test_shutdown(self, ray_setup, basic_config): - """Test shutdown functionality""" - # Create new actor for testing shutdown - actor = MessageQueue.remote(basic_config, max_queue_size=10) - client = MessageQueueClient(actor) - - # Shutdown should not throw exceptions - client.shutdown() - - -class TestConcurrency: - """Test concurrent scenarios""" - - def setup_method(self): - """Setup before each test method""" - if not ray.is_initialized(): - ray.init(local_mode=True, ignore_reinit_error=True) - - def teardown_method(self): - """Cleanup after each test method""" - if ray.is_initialized(): - ray.shutdown() - - def create_message_queue_client(self, config=None): - """Helper method to create MessageQueue client""" - if config is None: - config = DictConfig({"async_training": {"staleness_threshold": 3}}) - actor = MessageQueue.remote(config, max_queue_size=10) - return MessageQueueClient(actor) - - def test_concurrent_put_get(self, mock_sample): - """Test concurrent put and get""" - client = self.create_message_queue_client() - try: - results = [] - - def producer(): - for i in range(50): - samples = [mock_sample, mock_sample] - result = client.put_sample(sample=samples, param_version=1, rollout_metadata=None) - results.append(("put", result)) - 
time.sleep(0.1) - - def consumer(): - for _ in range(100): - try: - retrieved_samples = client.get_samples(min_batch_count=1) - results.append(("get", len(retrieved_samples) > 0)) - except Exception as e: - print(e) - results.append(("get", False)) - time.sleep(0.1) - - # Start producer and consumer threads - producer_thread = threading.Thread(target=producer) - consumer_thread = threading.Thread(target=consumer) - - producer_thread.start() - time.sleep(0.05) - consumer_thread.start() - - producer_thread.join(timeout=5) - consumer_thread.join(timeout=5) - - # Check results - put_results = [r[1] for r in results if r[0] == "put"] - get_results = [r[1] for r in results if r[0] == "get"] - - assert all(put_results) - assert all(get_results) - finally: - client.shutdown() - - def test_consume_first_produce_later(self, message_queue_client, mock_data_proto): - """Test consume first, produce later scenario - verify blocking and wake-up mechanism""" - consumer_result = [] - producer_result = [] - - def consumer_task(): - """Consumer task: start first, wait for producer to generate data""" - # Record the start time of consumption - consumer_start = time.time() - # This will block until at least 3 samples are available - samples = message_queue_client.get_samples(min_batch_count=3) - consumer_end = time.time() - consumer_result.append( - { - "success": True, - "samples_count": len(samples), - "wait_time": consumer_end - consumer_start, - "samples": samples, - } - ) - - def producer_task(): - """Producer task: start producing after a delay""" - time.sleep(4.0) - producer_start = time.time() - message_queue_client.put_sample( - sample=mock_data_proto, - param_version=1, - ) - time.sleep(1) - message_queue_client.put_sample( - sample=mock_data_proto, - param_version=1, - ) - time.sleep(1) - message_queue_client.put_sample( - sample=mock_data_proto, - param_version=1, - ) - producer_end = time.time() - producer_result.append( - { - "put_count": 3, - "produce_time": producer_end 
- producer_start, - } - ) - - print("produce finish") - - # Start consumer thread (first) - consumer_thread = threading.Thread(target=consumer_task, name="Consumer") - time.sleep(3) - # Start producer thread (later) - producer_thread = threading.Thread(target=producer_task, name="Producer") - - consumer_thread.start() - time.sleep(0.1) - producer_thread.start() - - print("=========", flush=True) - - producer_thread.join() - print("producer_result", producer_result, flush=True) - consumer_thread.join() - print("consumer_result", consumer_result, flush=True) - - assert len(consumer_result) == 1, "消费者应该执行一次" - - consumer_data = consumer_result[0] - producer_data = producer_result[0] - - assert producer_data["put_count"] == 3 - assert consumer_data["samples_count"] == 3 - - final_queue_size = message_queue_client.get_queue_size() - assert final_queue_size == 0 - - stats = message_queue_client.get_statistics() - assert stats["total_produced"] == 3 - assert stats["total_consumed"] == 3 - - -if __name__ == "__main__": - pytest.main([__file__, "-v", "--tb=short"]) diff --git a/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py b/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py index 0831aebd5b4..3bcd3e0a959 100644 --- a/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py +++ b/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py @@ -124,6 +124,10 @@ async def resume(self): async with self.lock: self.paused = False + async def reset_prefix_cache(self): + async with self.lock: + await self.engine.reset_prefix_cache() + class FullyAsyncvLLMReplica(vLLMReplica): def __init__(self, replica_rank: int, config: DictConfig, gpus_per_node: int = 8): @@ -137,3 +141,7 @@ async def cancel(self): async def resume(self): """Resume each rollout server.""" await asyncio.gather(*[server.resume.remote() for server in self.servers]) + + async def reset_prefix_cache(self): + """reset kv cache in each rollout server.""" + await 
asyncio.gather(*[server.reset_prefix_cache.remote() for server in self.servers]) diff --git a/verl/experimental/agent_loop/__init__.py b/verl/experimental/agent_loop/__init__.py index fd3d2ca1b84..d43683df3e4 100644 --- a/verl/experimental/agent_loop/__init__.py +++ b/verl/experimental/agent_loop/__init__.py @@ -18,4 +18,4 @@ _ = [SingleTurnAgentLoop, ToolAgentLoop] -# __all__ = ["AgentLoopBase", "AgentLoopManager", "AsyncLLMServerManager", "AgentLoopWorker"] +__all__ = ["AgentLoopBase", "AgentLoopManager", "AsyncLLMServerManager", "AgentLoopWorker"] From 8f62a942ab086273cfd6c4663682b54abc7fea7e Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Thu, 18 Sep 2025 21:07:23 +0800 Subject: [PATCH 155/182] refactor 8 --- .../unittest/simple_streaming_demo.py | 100 +++++++++--------- 1 file changed, 50 insertions(+), 50 deletions(-) diff --git a/recipe/fully_async_policy/unittest/simple_streaming_demo.py b/recipe/fully_async_policy/unittest/simple_streaming_demo.py index d3ae0702e3f..209c2aae39b 100644 --- a/recipe/fully_async_policy/unittest/simple_streaming_demo.py +++ b/recipe/fully_async_policy/unittest/simple_streaming_demo.py @@ -18,7 +18,7 @@ class SimpleStreamingSystem: - """简化的流式处理系统演示""" + """Simplified streaming system demonstration""" def __init__(self, max_concurrent_tasks: int = 4): self.max_concurrent_tasks = max_concurrent_tasks @@ -26,148 +26,148 @@ def __init__(self, max_concurrent_tasks: int = 4): self.result_queue = asyncio.Queue() self.consumer_count = 0 - # 数据流协程 + # Data stream coroutine async def data_stream(self): - # 添加初始数据 - # 准备测试数据 - test_data = [{"id": f"task_{i}", "content": f"数据_{i}"} for i in range(8)] + # Add initial data + # Prepare test data + test_data = [{"id": f"task_{i}", "content": f"data_{i}"} for i in range(8)] await self.add_data_stream(test_data) - # 模拟后续数据流 + # Simulate subsequent data stream await asyncio.sleep(3) - print("\n添加第二批数据...") - extra_data = [{"id": f"extra_{i}", "content": f"额外数据_{i}"} for i in range(5)] + 
print("\nAdding second batch of data...") + extra_data = [{"id": f"extra_{i}", "content": f"extra_data_{i}"} for i in range(5)] await self.add_data_stream(extra_data) - # 发送结束信号 + # Send termination signal await asyncio.sleep(1) await self.data_queue.put("DONE") - print("发送结束信号") + print("Sending termination signal") async def add_data_stream(self, data_list: list[dict]): - """模拟数据流""" - print("开始添加数据流...") + """Simulate data stream""" + print("Starting to add data stream...") for i, data_item in enumerate(data_list): await self.data_queue.put(data_item) - print(f"数据 {data_item['id']} 进入待处理队列") + print(f"Data {data_item['id']} added to pending queue") - # 模拟数据流的间隔 - if i < len(data_list) - 1: # 最后一个不等待 + # Simulate interval between data streams + if i < len(data_list) - 1: # Don't wait after the last item await asyncio.sleep(0.8) - print("初始数据流添加完成") + print("Initial data stream added successfully") async def _process_data_async(self, data_item: dict): - """异步处理单个数据项""" + """Asynchronously process a single data item""" data_id = data_item["id"] content = data_item["content"] - # 模拟不同的处理时间(1-3秒) + # Simulate different processing times (1-3 seconds) processing_time = random.uniform(1, 3) - print(f" 开始处理 {data_id},预计耗时 {processing_time:.1f}s") + print(f" Starting to process {data_id}, estimated time {processing_time:.1f}s") - # 异步等待处理完成 + # Asynchronously wait for processing completion await asyncio.sleep(processing_time) result = { "id": data_id, - "processed_content": f"处理后的{content}", + "processed_content": f"Processed {content}", "processing_time": round(processing_time, 2), "completed_at": time.time(), } - # 立即放入结果队列 + # Immediately put into result queue await self.result_queue.put(result) - print(f" {data_id} 处理完成!(耗时 {processing_time:.1f}s) -> 进入结果队列") + print(f" {data_id} processing completed! 
(took {processing_time:.1f}s) -> Added to result queue") async def _submit_worker(self): - """流式提交工作协程""" + """Stream submission worker coroutine""" active_tasks = set() - print("流式提交器启动...") + print("Stream submitter started...") while True: - # 获取待处理数据 + # Get data to process data_item = await self.data_queue.get() if data_item == "DONE": - print("收到结束信号,等待剩余任务完成...") + print("Received termination signal, waiting for remaining tasks to complete...") if active_tasks: await asyncio.gather(*active_tasks, return_exceptions=True) break - # 检查并发数限制 + # Check concurrent limit while len(active_tasks) >= self.max_concurrent_tasks: - print(f"达到最大并发数 {self.max_concurrent_tasks},等待任务完成...") + print(f"Reached maximum concurrency {self.max_concurrent_tasks}, waiting for tasks to complete...") done_tasks, active_tasks = await asyncio.wait(active_tasks, return_when=asyncio.FIRST_COMPLETED) - # 清理完成的任务 + # Clean up completed tasks for task in done_tasks: try: await task - print(f"task 完成 {task}") + print(f"Task completed {task}") except Exception as e: - print(f"任务执行失败: {e}") + print(f"Task execution failed: {e}") - # 立即提交新任务 + # Immediately submit new task task = asyncio.create_task(self._process_data_async(data_item), name=f"active {data_item}") active_tasks.add(task) - print(f"提交任务 {data_item['id']},当前并发数: {len(active_tasks)}") + print(f"Submitted task {data_item['id']}, current concurrency: {len(active_tasks)}") async def _consumer_worker(self): - """结果消费协程""" - print("消费者启动...") + """Result consumer coroutine""" + print("Consumer started...") while True: try: - # 从结果队列获取处理结果 + # Get processing result from result queue result = await asyncio.wait_for(self.result_queue.get(), timeout=2.0) self.consumer_count += 1 print( - f"消费 #{self.consumer_count}: {result['id']} " - f"(处理时间 {result['processing_time']}s) - {result['processed_content']}" + f"Consumed #{self.consumer_count}: {result['id']} " + f"(processing time {result['processing_time']}s) - {result['processed_content']}" ) 
except asyncio.TimeoutError: - print(" 消费者等待中...") + print(" Consumer waiting...") await asyncio.sleep(0.5) async def run_demo(self): - """运行演示""" + """Run demonstration""" print("=" * 60) - print(f"最大并发数: {self.max_concurrent_tasks}") + print(f"Maximum concurrency: {self.max_concurrent_tasks}") print("=" * 60) - # 启动核心协程 + # Start core coroutines stream_task = asyncio.create_task(self.data_stream()) submit_task = asyncio.create_task(self._submit_worker()) consumer_task = asyncio.create_task(self._consumer_worker()) try: - # 等待数据流完成 + # Wait for data stream to complete await stream_task - print("数据流完成") + print("Data stream completed") - # 等待处理完成 + # Wait for processing to complete await submit_task - print("所有任务处理完成") + print("All tasks processed") finally: - # 清理 + # Cleanup submit_task.cancel() consumer_task.cancel() await asyncio.gather(submit_task, consumer_task, return_exceptions=True) - print(f"\n最终统计: 消费了 {self.consumer_count} 个结果") + print(f"\nFinal statistics: Consumed {self.consumer_count} results") async def main(): - """主函数""" + """Main function""" system = SimpleStreamingSystem(max_concurrent_tasks=3) await system.run_demo() From 26849432bc2328b3fea7b15d93d937a21af7cc49 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Thu, 18 Sep 2025 21:52:25 +0800 Subject: [PATCH 156/182] refactor 10 --- .../exp/qwen2-32B_128/fsdp2_colocate/run.sh | 133 -------------- .../fsdp2_colocate/runtime_env.yaml | 5 - .../fsdp2_fully-async_64-64/run.sh | 153 ---------------- .../fsdp2_fully-async_64-64/runtime_env.yaml | 4 - .../fsdp2_fully-async_80-48/run.sh | 153 ---------------- .../fsdp2_fully-async_80-48/runtime_env.yaml | 4 - .../fsdp2_fully-async_96-32/run.sh | 153 ---------------- .../fsdp2_fully-async_96-32/runtime_env.yaml | 4 - .../qwen2-7B-math_128/fsdp2_colocate/run.sh | 133 -------------- .../fsdp2_colocate/runtime_env.yaml | 3 - .../fsdp2_fully-async_64-64/runtime_env.yaml | 4 - .../runtime_env.yaml | 4 - .../megatron_colocate/run.sh | 135 -------------- 
.../megatron_colocate/runtime_env.yaml | 3 - .../qwen2-7B-math_32/fsdp2_colocate/run.sh | 133 -------------- .../fsdp2_colocate/runtime_env.yaml | 3 - .../fsdp2_fully-async_16-16/run.sh | 154 ---------------- .../fsdp2_fully-async_16-16/runtime_env.yaml | 4 - .../fsdp2_fully-async_24-8/run.sh | 168 ------------------ .../fsdp2_fully-async_24-8/runtime_env.yaml | 5 - .../fsdp2_fully-async_8-24/run.sh | 168 ------------------ .../fsdp2_fully-async_8-24/runtime_env.yaml | 5 - .../qwen2-7B-math_32/megatron_colocate/run.sh | 135 -------------- .../megatron_colocate/runtime_env.yaml | 3 - .../qwen2-7B-math_64/fsdp2_colocate/run.sh | 133 -------------- .../fsdp2_colocate/runtime_env.yaml | 3 - .../fsdp2_fully-async_24-40/run.sh | 168 ------------------ .../fsdp2_fully-async_24-40/runtime_env.yaml | 5 - .../fsdp2_fully-async_32-32/run.sh | 168 ------------------ .../fsdp2_fully-async_32-32/runtime_env.yaml | 4 - .../fsdp2_fully-async_40-24/run.sh | 168 ------------------ .../fsdp2_fully-async_40-24/runtime_env.yaml | 5 - .../qwen2-7B-math_64/megatron_colocate/run.sh | 135 -------------- .../megatron_colocate/runtime_env.yaml | 3 - .../qwen3-30BA3B_128/fsdp2_colocate/run.sh | 125 ------------- .../fsdp2_colocate/runtime_env.yaml | 3 - .../qwen3-30BA3B_128/megatron_colocate/run.sh | 161 ----------------- .../megatron_colocate/runtime_env.yaml | 5 - .../exp/qwen3-32B_128/fsdp2_colocate/run.sh | 125 ------------- .../fsdp2_colocate/runtime_env.yaml | 3 - .../qwen3-32B_128/megatron_colocate/run.sh | 156 ---------------- .../megatron_colocate/runtime_env.yaml | 5 - recipe/fully_async_policy/fully_async_main.py | 6 +- .../fully_async_rollouter.py | 6 +- .../fully_async_policy/fully_async_trainer.py | 5 +- .../run.sh => shell/dapo-32B_fsdp2_64_64.sh} | 20 +-- .../shell/dapo_7b_math_fsdp2_4_12.sh | 10 +- ...fsdp2_2_6.sh => dapo_7b_math_fsdp2_4_4.sh} | 14 +- .../dapo_7b_math_fsdp2_64_64.sh} | 10 +- .../shell/dapo_7b_math_fsdp2_8_8.sh | 22 +-- .../shell/dapo_7b_math_fsdp2_colocate.sh 
| 141 --------------- .../shell/dapo_7b_math_fsdp2_server.sh | 148 --------------- .../shell/dapo_7b_math_megatron_colocate.sh | 142 --------------- .../fully_async_policy/shell/runtime_env.yaml | 1 - recipe/one_step_off_policy/fsdp_workers.py | 1 - .../rollout/vllm_rollout/vllm_rollout_spmd.py | 1 - 56 files changed, 43 insertions(+), 3528 deletions(-) delete mode 100644 recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_colocate/run.sh delete mode 100644 recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_colocate/runtime_env.yaml delete mode 100644 recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh delete mode 100644 recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/runtime_env.yaml delete mode 100644 recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_80-48/run.sh delete mode 100644 recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_80-48/runtime_env.yaml delete mode 100644 recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_96-32/run.sh delete mode 100644 recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_96-32/runtime_env.yaml delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_colocate/run.sh delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_colocate/runtime_env.yaml delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/runtime_env.yaml delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64_stal0.1/runtime_env.yaml delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/megatron_colocate/run.sh delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_128/megatron_colocate/runtime_env.yaml delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate/run.sh delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate/runtime_env.yaml delete mode 100644 
recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/run.sh delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/runtime_env.yaml delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/run.sh delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/runtime_env.yaml delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/run.sh delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/runtime_env.yaml delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/run.sh delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/runtime_env.yaml delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_colocate/run.sh delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_colocate/runtime_env.yaml delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/run.sh delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/runtime_env.yaml delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/run.sh delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/runtime_env.yaml delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/run.sh delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/runtime_env.yaml delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/run.sh delete mode 100644 recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/runtime_env.yaml delete mode 100644 recipe/fully_async_policy/exp/qwen3-30BA3B_128/fsdp2_colocate/run.sh delete mode 100644 recipe/fully_async_policy/exp/qwen3-30BA3B_128/fsdp2_colocate/runtime_env.yaml delete mode 100644 
recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/run.sh delete mode 100644 recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/runtime_env.yaml delete mode 100644 recipe/fully_async_policy/exp/qwen3-32B_128/fsdp2_colocate/run.sh delete mode 100644 recipe/fully_async_policy/exp/qwen3-32B_128/fsdp2_colocate/runtime_env.yaml delete mode 100644 recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/run.sh delete mode 100644 recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/runtime_env.yaml rename recipe/fully_async_policy/{exp/qwen2-7B-math_128/fsdp2_fully-async_64-64_stal0.1/run.sh => shell/dapo-32B_fsdp2_64_64.sh} (91%) rename recipe/fully_async_policy/shell/{dapo_7b_math_fsdp2_2_6.sh => dapo_7b_math_fsdp2_4_4.sh} (94%) rename recipe/fully_async_policy/{exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/run.sh => shell/dapo_7b_math_fsdp2_64_64.sh} (93%) delete mode 100644 recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_colocate.sh delete mode 100644 recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_server.sh delete mode 100644 recipe/fully_async_policy/shell/dapo_7b_math_megatron_colocate.sh diff --git a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_colocate/run.sh b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_colocate/run.sh deleted file mode 100644 index 92203a7d87a..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_colocate/run.sh +++ /dev/null @@ -1,133 +0,0 @@ -#!/usr/bin/env bash -set -xeuo pipefail - -project_name='DAPO' -exp_name='dapo_qwen2-32B_20k_fsdp2_colocate_128' - -adv_estimator=grpo - -use_kl_in_reward=False -kl_coef=0.0 -use_kl_loss=False -kl_loss_coef=0.0 - -clip_ratio_low=0.2 -clip_ratio_high=0.28 - -max_prompt_length=$((1024 * 2)) -max_response_length=$((1024 * 20)) -enable_overlong_buffer=True -overlong_buffer_len=$((1024 * 4)) -overlong_penalty_factor=1.0 - -loss_agg_mode="token-mean" - -train_prompt_bsz=512 -n_resp_per_prompt=16 -train_prompt_mini_bsz=32 - -# Ray -# 
RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} -# WORKING_DIR=${WORKING_DIR:-"${PWD}"} -# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} -NNODES=${NNODES:-16} -NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} -# Paths -RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} -# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface - -MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/Qwen2.5-32B -CKPTS_DIR=./ckpts/${project_name}/${exp_name} -TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet -TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet -# Algorithm -temperature=1.0 -top_p=1.0 -top_k=-1 # 0 for HF rollout, -1 for vLLM rollout -val_top_p=0.7 - -# Performance Related Parameter -use_dynamic_bsz=True -actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) -infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) -offload=True -gen_tp=4 -sp_size=8 -fsdp_size=-1 - -# reference run wandb: https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/runs/ow47vvon?nw=nwusertongyuxuan361 - - -python -m verl.trainer.main_ppo \ - data.train_files="${TRAIN_FILE}" \ - data.val_files="${TEST_FILE}" \ - data.prompt_key=prompt \ - data.truncation='left' \ - data.max_prompt_length=${max_prompt_length} \ - data.max_response_length=${max_response_length} \ - data.train_batch_size=${train_prompt_bsz} \ - actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ - algorithm.adv_estimator=${adv_estimator} \ - algorithm.use_kl_in_reward=${use_kl_in_reward} \ - algorithm.kl_ctrl.kl_coef=${kl_coef} \ - actor_rollout_ref.actor.strategy=fsdp2 \ - critic.strategy=fsdp2 \ - actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ - actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ - 
actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ - actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ - actor_rollout_ref.actor.clip_ratio_c=10.0 \ - actor_rollout_ref.model.use_remove_padding=True \ - +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ - actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ - actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.model.path="${MODEL_PATH}" \ - actor_rollout_ref.model.enable_gradient_checkpointing=True \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ - actor_rollout_ref.actor.optim.weight_decay=0.1 \ - actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ - actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \ - actor_rollout_ref.actor.entropy_coeff=0 \ - actor_rollout_ref.actor.grad_clip=1.0 \ - actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ - actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ - actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ - actor_rollout_ref.rollout.enable_chunked_prefill=True \ - actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ - actor_rollout_ref.rollout.temperature=${temperature} \ - actor_rollout_ref.rollout.top_p=${top_p} \ - actor_rollout_ref.rollout.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ - actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ - 
actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.do_sample=True \ - actor_rollout_ref.rollout.val_kwargs.n=1 \ - actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \ - actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ - reward_model.reward_manager=dapo \ - +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ - +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ - trainer.logger=['console','tensorboard'] \ - trainer.project_name="${project_name}" \ - trainer.experiment_name="${exp_name}" \ - trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - trainer.nnodes="${NNODES}" \ - trainer.val_before_train=True \ - trainer.test_freq=20 \ - trainer.save_freq=-1 \ - trainer.total_epochs=10 \ - trainer.total_training_steps=400 \ - trainer.default_local_dir="${CKPTS_DIR}" \ - trainer.resume_mode=auto \ - trainer.log_val_generations=10 diff --git a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_colocate/runtime_env.yaml deleted file mode 100644 index e33cfd681ca..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_colocate/runtime_env.yaml +++ /dev/null @@ -1,5 +0,0 @@ -env_vars: - TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-32B-128/dapo_qwen2-32B_20k_fsdp2_colocate_128" - HYDRA_FULL_ERROR: "1" - TORCH_NCCL_AVOID_RECORD_STREAMS: "1" - VLLM_USE_V1: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh 
b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh deleted file mode 100644 index 48be3ab3c84..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/run.sh +++ /dev/null @@ -1,153 +0,0 @@ -#!/usr/bin/env bash -set -xeuo pipefail - -project_name='DAPO' -exp_name='dapo_qwen2-32B_20k_fsdp2_fully-async_64-64' - -# Paths -MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/Qwen2.5-32B -CKPTS_DIR=./ckpts/${project_name}/${exp_name} -TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet -TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet - -rollout_mode="async" -rollout_name="vllm" # sglang or vllm -if [ "$rollout_mode" = "async" ]; then - export VLLM_USE_V1=1 - return_raw_chat="True" -fi - -# Algorithm parameters -adv_estimator=grpo - -use_kl_in_reward=False -kl_coef=0.0 -use_kl_loss=False -kl_loss_coef=0.0 - -clip_ratio_low=0.2 -clip_ratio_high=0.28 - -# Response length parameters -max_prompt_length=$((1024 * 2)) -max_response_length=$((1024 * 20)) -enable_overlong_buffer=True -overlong_buffer_len=$((1024 * 4)) -overlong_penalty_factor=1.0 - -# Training parameters -loss_agg_mode="token-mean" - -# Algorithm -temperature=1.0 -top_p=1.0 -top_k=-1 # 0 for HF rollout, -1 for vLLM rollout -val_top_p=0.7 - -# Performance Related Parameter -use_dynamic_bsz=True -actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) -infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) -ref_offload=True -actor_offload=False -gen_tp=4 -sp_size=8 -fsdp_size=-1 - -# Fully async specific parameters -NNODES_ROLLOUT=${NNODES_ROLLOUT:-8} -NNODES_TRAIN=${NNODES_TRAIN:-8} -NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} - -train_prompt_bsz=0 -gen_prompt_bsz=1 -n_resp_per_prompt=16 -train_prompt_mini_bsz=16 -total_rollout_steps=$(((512*200))) -test_freq=20 
-staleness_threshold=0.1 -trigger_parameter_sync_step=8 -partial_rollout=True - -python -m recipe.fully_async_policy.fully_async_main \ - data.train_files="${TRAIN_FILE}" \ - data.val_files="${TEST_FILE}" \ - data.prompt_key=prompt \ - data.truncation='left' \ - data.max_prompt_length=${max_prompt_length} \ - data.max_response_length=${max_response_length} \ - data.train_batch_size=${train_prompt_bsz} \ - data.gen_batch_size=${gen_prompt_bsz} \ - data.return_raw_chat=${return_raw_chat} \ - actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ - algorithm.adv_estimator=${adv_estimator} \ - algorithm.use_kl_in_reward=${use_kl_in_reward} \ - algorithm.kl_ctrl.kl_coef=${kl_coef} \ - actor_rollout_ref.actor.strategy=fsdp2 \ - critic.strategy=fsdp2 \ - actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ - actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ - actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ - actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ - actor_rollout_ref.actor.clip_ratio_c=10.0 \ - actor_rollout_ref.model.use_remove_padding=True \ - actor_rollout_ref.hybrid_engine=False \ - +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ - actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ - actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.model.path="${MODEL_PATH}" \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ - actor_rollout_ref.actor.optim.weight_decay=0.1 \ - actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ - actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} 
\ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \ - actor_rollout_ref.actor.entropy_coeff=0 \ - actor_rollout_ref.actor.grad_clip=1.0 \ - actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ - actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ - actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ - actor_rollout_ref.rollout.enable_chunked_prefill=True \ - actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ - actor_rollout_ref.rollout.temperature=${temperature} \ - actor_rollout_ref.rollout.top_p=${top_p} \ - actor_rollout_ref.rollout.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ - actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ - actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.do_sample=True \ - actor_rollout_ref.rollout.val_kwargs.n=1 \ - actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \ - actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ - actor_rollout_ref.rollout.name=${rollout_name} \ - actor_rollout_ref.rollout.mode=${rollout_mode} \ - actor_rollout_ref.rollout.calculate_log_probs=True \ - reward_model.reward_manager=dapo \ - +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ - +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ - trainer.logger=['console','tensorboard'] \ - trainer.project_name="${project_name}" \ - trainer.experiment_name="${exp_name}" \ - trainer.val_before_train=True \ - trainer.save_freq=-1 \ - 
trainer.default_local_dir="${CKPTS_DIR}" \ - trainer.resume_mode=auto \ - trainer.nnodes="${NNODES_TRAIN}" \ - trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - rollout.nnodes="${NNODES_ROLLOUT}" \ - rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ - rollout.total_rollout_steps="${total_rollout_steps}" \ - rollout.total_epochs=10 \ - rollout.test_freq="${test_freq}" \ - async_training.staleness_threshold="${staleness_threshold}" \ - async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ - async_training.partial_rollout="${partial_rollout}" diff --git a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/runtime_env.yaml deleted file mode 100644 index ea506be787e..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_64-64/runtime_env.yaml +++ /dev/null @@ -1,4 +0,0 @@ -env_vars: - VLLM_USE_V1: "1" - TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-32B-128/dapo_qwen2-32B_20k_fsdp2_fully-async_64-64-tps1" - HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_80-48/run.sh b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_80-48/run.sh deleted file mode 100644 index fd2874d0f98..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_80-48/run.sh +++ /dev/null @@ -1,153 +0,0 @@ -#!/usr/bin/env bash -set -xeuo pipefail - -project_name='DAPO' -exp_name='dapo_qwen2-32B_20k_fsdp2_fully-async_80-48-tps1' - -# Paths -MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/Qwen2.5-32B -CKPTS_DIR=./ckpts/${project_name}/${exp_name} -TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet 
-TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet - -rollout_mode="async" -rollout_name="vllm" # sglang or vllm -if [ "$rollout_mode" = "async" ]; then - export VLLM_USE_V1=1 - return_raw_chat="True" -fi - -# Algorithm parameters -adv_estimator=grpo - -use_kl_in_reward=False -kl_coef=0.0 -use_kl_loss=False -kl_loss_coef=0.0 - -clip_ratio_low=0.2 -clip_ratio_high=0.28 - -# Response length parameters -max_prompt_length=$((1024 * 2)) -max_response_length=$((1024 * 20)) -enable_overlong_buffer=True -overlong_buffer_len=$((1024 * 4)) -overlong_penalty_factor=1.0 - -# Training parameters -loss_agg_mode="token-mean" - -# Algorithm -temperature=1.0 -top_p=1.0 -top_k=-1 # 0 for HF rollout, -1 for vLLM rollout -val_top_p=0.7 - -# Performance Related Parameter -use_dynamic_bsz=True -actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) -infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) -ref_offload=True -actor_offload=False -gen_tp=4 -sp_size=8 -fsdp_size=-1 - -# Fully async specific parameters -NNODES_ROLLOUT=${NNODES_ROLLOUT:-10} -NNODES_TRAIN=${NNODES_TRAIN:-6} -NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} - -train_prompt_bsz=0 -gen_prompt_bsz=1 -n_resp_per_prompt=16 -train_prompt_mini_bsz=128 -total_rollout_steps=$(((512*400))) -test_freq=20 -staleness_threshold=0.1 -trigger_parameter_sync_step=1 -partial_rollout=True - -python -m recipe.fully_async_policy.fully_async_main \ - data.train_files="${TRAIN_FILE}" \ - data.val_files="${TEST_FILE}" \ - data.prompt_key=prompt \ - data.truncation='left' \ - data.max_prompt_length=${max_prompt_length} \ - data.max_response_length=${max_response_length} \ - data.train_batch_size=${train_prompt_bsz} \ - data.gen_batch_size=${gen_prompt_bsz} \ - data.return_raw_chat=${return_raw_chat} \ - actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ - algorithm.adv_estimator=${adv_estimator} \ - algorithm.use_kl_in_reward=${use_kl_in_reward} \ - 
algorithm.kl_ctrl.kl_coef=${kl_coef} \ - actor_rollout_ref.actor.strategy=fsdp2 \ - critic.strategy=fsdp2 \ - actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ - actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ - actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ - actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ - actor_rollout_ref.actor.clip_ratio_c=10.0 \ - actor_rollout_ref.model.use_remove_padding=True \ - actor_rollout_ref.hybrid_engine=False \ - +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ - actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ - actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.model.path="${MODEL_PATH}" \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ - actor_rollout_ref.actor.optim.weight_decay=0.1 \ - actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ - actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \ - actor_rollout_ref.actor.entropy_coeff=0 \ - actor_rollout_ref.actor.grad_clip=1.0 \ - actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ - actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ - actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ - actor_rollout_ref.rollout.enable_chunked_prefill=True \ - actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ - actor_rollout_ref.rollout.temperature=${temperature} \ - 
actor_rollout_ref.rollout.top_p=${top_p} \ - actor_rollout_ref.rollout.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ - actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ - actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.do_sample=True \ - actor_rollout_ref.rollout.val_kwargs.n=1 \ - actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \ - actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ - actor_rollout_ref.rollout.name=${rollout_name} \ - actor_rollout_ref.rollout.mode=${rollout_mode} \ - actor_rollout_ref.rollout.calculate_log_probs=True \ - reward_model.reward_manager=dapo \ - +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ - +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ - trainer.logger=['console','tensorboard'] \ - trainer.project_name="${project_name}" \ - trainer.experiment_name="${exp_name}" \ - trainer.val_before_train=True \ - trainer.save_freq=-1 \ - trainer.default_local_dir="${CKPTS_DIR}" \ - trainer.resume_mode=auto \ - trainer.nnodes="${NNODES_TRAIN}" \ - trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - rollout.nnodes="${NNODES_ROLLOUT}" \ - rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ - rollout.total_rollout_steps="${total_rollout_steps}" \ - rollout.total_epochs=10 \ - rollout.test_freq="${test_freq}" \ - async_training.staleness_threshold="${staleness_threshold}" \ - async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ - async_training.partial_rollout="${partial_rollout}" diff --git 
a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_80-48/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_80-48/runtime_env.yaml deleted file mode 100644 index 9997c4130f2..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_80-48/runtime_env.yaml +++ /dev/null @@ -1,4 +0,0 @@ -env_vars: - VLLM_USE_V1: "1" - TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-32B-128/dapo_qwen2-32B_20k_fsdp2_fully-async_80-48-tps1" - HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_96-32/run.sh b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_96-32/run.sh deleted file mode 100644 index 827e9a30e41..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_96-32/run.sh +++ /dev/null @@ -1,153 +0,0 @@ -#!/usr/bin/env bash -set -xeuo pipefail - -project_name='DAPO' -exp_name='dapo_qwen2-32B_20k_fsdp2_fully-async_96-32-tps1' - -# Paths -MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/Qwen2.5-32B -CKPTS_DIR=./ckpts/${project_name}/${exp_name} -TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet -TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet - -rollout_mode="async" -rollout_name="vllm" # sglang or vllm -if [ "$rollout_mode" = "async" ]; then - export VLLM_USE_V1=1 - return_raw_chat="True" -fi - -# Algorithm parameters -adv_estimator=grpo - -use_kl_in_reward=False -kl_coef=0.0 -use_kl_loss=False -kl_loss_coef=0.0 - -clip_ratio_low=0.2 -clip_ratio_high=0.28 - -# Response length parameters -max_prompt_length=$((1024 * 2)) -max_response_length=$((1024 * 20)) -enable_overlong_buffer=True -overlong_buffer_len=$((1024 * 4)) -overlong_penalty_factor=1.0 - -# Training parameters 
-loss_agg_mode="token-mean" - -# Algorithm -temperature=1.0 -top_p=1.0 -top_k=-1 # 0 for HF rollout, -1 for vLLM rollout -val_top_p=0.7 - -# Performance Related Parameter -use_dynamic_bsz=True -actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) -infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) -ref_offload=True -actor_offload=False -gen_tp=4 -sp_size=8 -fsdp_size=-1 - -# Fully async specific parameters -NNODES_ROLLOUT=${NNODES_ROLLOUT:-12} -NNODES_TRAIN=${NNODES_TRAIN:-4} -NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} - -train_prompt_bsz=0 -gen_prompt_bsz=1 -n_resp_per_prompt=16 -train_prompt_mini_bsz=128 -total_rollout_steps=$(((512*400))) -test_freq=20 -staleness_threshold=0.1 -trigger_parameter_sync_step=2 -partial_rollout=True - -python -m recipe.fully_async_policy.fully_async_main \ - data.train_files="${TRAIN_FILE}" \ - data.val_files="${TEST_FILE}" \ - data.prompt_key=prompt \ - data.truncation='left' \ - data.max_prompt_length=${max_prompt_length} \ - data.max_response_length=${max_response_length} \ - data.train_batch_size=${train_prompt_bsz} \ - data.gen_batch_size=${gen_prompt_bsz} \ - data.return_raw_chat=${return_raw_chat} \ - actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ - algorithm.adv_estimator=${adv_estimator} \ - algorithm.use_kl_in_reward=${use_kl_in_reward} \ - algorithm.kl_ctrl.kl_coef=${kl_coef} \ - actor_rollout_ref.actor.strategy=fsdp2 \ - critic.strategy=fsdp2 \ - actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ - actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ - actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ - actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ - actor_rollout_ref.actor.clip_ratio_c=10.0 \ - actor_rollout_ref.model.use_remove_padding=True \ - actor_rollout_ref.hybrid_engine=False \ - +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ - actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ - 
actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ - actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.model.path="${MODEL_PATH}" \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ - actor_rollout_ref.actor.optim.weight_decay=0.1 \ - actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ - actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \ - actor_rollout_ref.actor.entropy_coeff=0 \ - actor_rollout_ref.actor.grad_clip=1.0 \ - actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ - actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ - actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ - actor_rollout_ref.rollout.enable_chunked_prefill=True \ - actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ - actor_rollout_ref.rollout.temperature=${temperature} \ - actor_rollout_ref.rollout.top_p=${top_p} \ - actor_rollout_ref.rollout.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ - actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ - actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.do_sample=True \ - actor_rollout_ref.rollout.val_kwargs.n=1 \ - actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \ - actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ - actor_rollout_ref.rollout.name=${rollout_name} \ - 
actor_rollout_ref.rollout.mode=${rollout_mode} \ - actor_rollout_ref.rollout.calculate_log_probs=True \ - reward_model.reward_manager=dapo \ - +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ - +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ - trainer.logger=['console','tensorboard'] \ - trainer.project_name="${project_name}" \ - trainer.experiment_name="${exp_name}" \ - trainer.val_before_train=True \ - trainer.save_freq=-1 \ - trainer.default_local_dir="${CKPTS_DIR}" \ - trainer.resume_mode=auto \ - trainer.nnodes="${NNODES_TRAIN}" \ - trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - rollout.nnodes="${NNODES_ROLLOUT}" \ - rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ - rollout.total_rollout_steps="${total_rollout_steps}" \ - rollout.total_epochs=10 \ - rollout.test_freq="${test_freq}" \ - async_training.staleness_threshold="${staleness_threshold}" \ - async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ - async_training.partial_rollout="${partial_rollout}" diff --git a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_96-32/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_96-32/runtime_env.yaml deleted file mode 100644 index be4ab6a6349..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-32B_128/fsdp2_fully-async_96-32/runtime_env.yaml +++ /dev/null @@ -1,4 +0,0 @@ -env_vars: - VLLM_USE_V1: "1" - TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-32B-128/dapo_qwen2-32B_20k_fsdp2_fully-async_96-32-tps1" - HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_colocate/run.sh 
b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_colocate/run.sh deleted file mode 100644 index 3538722d8a1..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_colocate/run.sh +++ /dev/null @@ -1,133 +0,0 @@ -#!/usr/bin/env bash -set -xeuo pipefail - -project_name='DAPO' -exp_name='dapo_qwen2-7B-math_28k_fsdp2_colocate_128_mbs32' - -adv_estimator=grpo - -use_kl_in_reward=False -kl_coef=0.0 -use_kl_loss=False -kl_loss_coef=0.0 - -clip_ratio_low=0.2 -clip_ratio_high=0.28 - -max_prompt_length=$((1024 * 2)) -max_response_length=$((1024 * 28)) -enable_overlong_buffer=True -overlong_buffer_len=$((1024 * 4)) -overlong_penalty_factor=1.0 - -loss_agg_mode="token-mean" - -train_prompt_bsz=512 -n_resp_per_prompt=16 -train_prompt_mini_bsz=32 - -# Ray -# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} -# WORKING_DIR=${WORKING_DIR:-"${PWD}"} -# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} -NNODES=${NNODES:-16} -NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} -# Paths -RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} -# very important! 
please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface - -MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B -CKPTS_DIR=./ckpts/${project_name}/${exp_name} -TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet -TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet -# Algorithm -temperature=1.0 -top_p=1.0 -top_k=-1 # 0 for HF rollout, -1 for vLLM rollout -val_top_p=0.7 - -# Performance Related Parameter -use_dynamic_bsz=True -actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) -infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) -offload=True -gen_tp=4 -sp_size=4 -fsdp_size=2 - -# reference run wandb: https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/runs/ow47vvon?nw=nwusertongyuxuan361 - - -python -m verl.trainer.main_ppo \ - data.train_files="${TRAIN_FILE}" \ - data.val_files="${TEST_FILE}" \ - data.prompt_key=prompt \ - data.truncation='left' \ - data.max_prompt_length=${max_prompt_length} \ - data.max_response_length=${max_response_length} \ - data.train_batch_size=${train_prompt_bsz} \ - actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ - algorithm.adv_estimator=${adv_estimator} \ - algorithm.use_kl_in_reward=${use_kl_in_reward} \ - algorithm.kl_ctrl.kl_coef=${kl_coef} \ - actor_rollout_ref.actor.strategy=fsdp2 \ - critic.strategy=fsdp2 \ - actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ - actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ - actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ - actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ - actor_rollout_ref.actor.clip_ratio_c=10.0 \ - actor_rollout_ref.model.use_remove_padding=True \ - +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ - 
actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ - actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.model.path="${MODEL_PATH}" \ - actor_rollout_ref.model.enable_gradient_checkpointing=True \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ - actor_rollout_ref.actor.optim.weight_decay=0.1 \ - actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ - actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \ - actor_rollout_ref.actor.entropy_coeff=0 \ - actor_rollout_ref.actor.grad_clip=1.0 \ - actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ - actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ - actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ - actor_rollout_ref.rollout.enable_chunked_prefill=True \ - actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ - actor_rollout_ref.rollout.temperature=${temperature} \ - actor_rollout_ref.rollout.top_p=${top_p} \ - actor_rollout_ref.rollout.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ - actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ - actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.do_sample=True \ - actor_rollout_ref.rollout.val_kwargs.n=1 \ - actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \ - actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ - 
actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ - reward_model.reward_manager=dapo \ - +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ - +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ - trainer.logger=['console','tensorboard'] \ - trainer.project_name="${project_name}" \ - trainer.experiment_name="${exp_name}" \ - trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - trainer.nnodes="${NNODES}" \ - trainer.val_before_train=True \ - trainer.test_freq=20 \ - trainer.save_freq=-1 \ - trainer.total_epochs=10 \ - trainer.total_training_steps=400 \ - trainer.default_local_dir="${CKPTS_DIR}" \ - trainer.resume_mode=auto \ - trainer.log_val_generations=10 diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_colocate/runtime_env.yaml deleted file mode 100644 index 8fc2de3e70b..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_colocate/runtime_env.yaml +++ /dev/null @@ -1,3 +0,0 @@ -env_vars: - TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-128/dapo_qwen2-7B-math_28k_fsdp2_colocate_128_mbs32" - HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/runtime_env.yaml deleted file mode 100644 index 5dfe2294911..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/runtime_env.yaml +++ /dev/null @@ -1,4 +0,0 @@ -env_vars: - VLLM_USE_V1: "1" - TENSORBOARD_DIR: 
"/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-128/dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tpf4_fixmcs" - HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64_stal0.1/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64_stal0.1/runtime_env.yaml deleted file mode 100644 index 92bacbdd204..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64_stal0.1/runtime_env.yaml +++ /dev/null @@ -1,4 +0,0 @@ -env_vars: - VLLM_USE_V1: "1" - TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-128/dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tpf4_stal0.1" - HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/megatron_colocate/run.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_128/megatron_colocate/run.sh deleted file mode 100644 index f98aeb86b57..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_128/megatron_colocate/run.sh +++ /dev/null @@ -1,135 +0,0 @@ -#!/usr/bin/env bash -set -xeuo pipefail - -project_name='DAPO' -exp_name='dapo_qwen2-7B-math_28k_megatron_colocate_64_mbs32' - -adv_estimator=grpo - -use_kl_in_reward=False -kl_coef=0.0 -use_kl_loss=False -kl_loss_coef=0.0 - -clip_ratio_low=0.2 -clip_ratio_high=0.28 - -max_prompt_length=$((1024 * 2)) -max_response_length=$((1024 * 28)) -enable_overlong_buffer=True -overlong_buffer_len=$((1024 * 4)) -overlong_penalty_factor=1.0 - -loss_agg_mode="token-mean" - -train_prompt_bsz=512 -n_resp_per_prompt=16 -train_prompt_mini_bsz=32 - -# Ray -# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} -# WORKING_DIR=${WORKING_DIR:-"${PWD}"} -# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} -NNODES=${NNODES:-8} -NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} -# 
Paths -MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B -CKPTS_DIR=./ckpts/${project_name}/${exp_name} -TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet -TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet - -# Algorithm -temperature=1.0 -top_p=1.0 -top_k=-1 # 0 for HF rollout, -1 for vLLM rollout -val_top_p=0.7 - -# Performance Related Parameter -use_dynamic_bsz=True -actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) -infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) -offload=True -gen_tp=4 -train_tp=4 -train_pp=2 - -# TODO: support dynamic_bsz for megatron -# actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ -# actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ -# actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ -# actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ -# actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ -# actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - -python3 -m verl.trainer.main_ppo \ - --config-path=config \ - --config-name='ppo_megatron_trainer.yaml' \ - data.train_files="${TRAIN_FILE}" \ - data.val_files="${TEST_FILE}" \ - data.prompt_key=prompt \ - data.truncation='left' \ - data.max_prompt_length=${max_prompt_length} \ - data.max_response_length=${max_response_length} \ - data.train_batch_size=${train_prompt_bsz} \ - actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ - algorithm.adv_estimator=${adv_estimator} \ - algorithm.use_kl_in_reward=${use_kl_in_reward} \ - algorithm.kl_ctrl.kl_coef=${kl_coef} \ - actor_rollout_ref.actor.strategy=megatron \ - critic.strategy=megatron \ - actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ - 
actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ - actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ - actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ - actor_rollout_ref.actor.clip_ratio_c=10.0 \ - +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \ - actor_rollout_ref.model.path="${MODEL_PATH}" \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ - actor_rollout_ref.actor.optim.weight_decay=0.1 \ - actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ - actor_rollout_ref.actor.megatron.param_offload=${offload} \ - actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \ - actor_rollout_ref.actor.megatron.grad_offload=${offload} \ - actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \ - actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \ - actor_rollout_ref.actor.entropy_coeff=0 \ - actor_rollout_ref.actor.optim.clip_grad=1.0 \ - actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ - actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ - actor_rollout_ref.rollout.enable_chunked_prefill=True \ - actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ - actor_rollout_ref.rollout.temperature=${temperature} \ - actor_rollout_ref.rollout.top_p=${top_p} \ - actor_rollout_ref.rollout.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ - actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ - actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.do_sample=True \ - actor_rollout_ref.rollout.val_kwargs.n=1 \ - 
actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \ - actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \ - actor_rollout_ref.ref.megatron.param_offload=${offload} \ - reward_model.reward_manager=dapo \ - +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ - +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ - trainer.logger=['console','tensorboard'] \ - trainer.project_name="${project_name}" \ - trainer.experiment_name="${exp_name}" \ - trainer.n_gpus_per_node=8 \ - trainer.nnodes="${NNODES}" \ - trainer.val_before_train=True \ - trainer.test_freq=20 \ - trainer.save_freq=-1 \ - trainer.total_epochs=10 \ - trainer.total_training_steps=400 \ - trainer.default_local_dir="${CKPTS_DIR}" \ - trainer.resume_mode=auto \ - trainer.log_val_generations=10 diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/megatron_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_128/megatron_colocate/runtime_env.yaml deleted file mode 100644 index 6e33f46a65a..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_128/megatron_colocate/runtime_env.yaml +++ /dev/null @@ -1,3 +0,0 @@ -env_vars: - TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-128/dapo_qwen2-7B-math_28k_megatron_colocate_128_mbs32" - HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate/run.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate/run.sh deleted file mode 100644 index 8d42dca04ca..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate/run.sh +++ /dev/null @@ -1,133 +0,0 @@ 
-#!/usr/bin/env bash -set -xeuo pipefail - -project_name='DAPO' -exp_name='dapo_qwen2-7B-math_28k_fsdp2_colocate_32_mbs32' - -adv_estimator=grpo - -use_kl_in_reward=False -kl_coef=0.0 -use_kl_loss=False -kl_loss_coef=0.0 - -clip_ratio_low=0.2 -clip_ratio_high=0.28 - -max_prompt_length=$((1024 * 2)) -max_response_length=$((1024 * 28)) -enable_overlong_buffer=True -overlong_buffer_len=$((1024 * 4)) -overlong_penalty_factor=1.0 - -loss_agg_mode="token-mean" - -train_prompt_bsz=512 -n_resp_per_prompt=16 -train_prompt_mini_bsz=32 - -# Ray -# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} -# WORKING_DIR=${WORKING_DIR:-"${PWD}"} -# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} -NNODES=${NNODES:-4} -NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} -# Paths -RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} -# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface - -MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B -CKPTS_DIR=./ckpts/${project_name}/${exp_name} -TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet -TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet -# Algorithm -temperature=1.0 -top_p=1.0 -top_k=-1 # 0 for HF rollout, -1 for vLLM rollout -val_top_p=0.7 - -# Performance Related Parameter -use_dynamic_bsz=True -actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) -infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) -offload=True -gen_tp=4 -sp_size=4 -fsdp_size=2 - -# reference run wandb: https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/runs/ow47vvon?nw=nwusertongyuxuan361 - - -python -m verl.trainer.main_ppo \ - data.train_files="${TRAIN_FILE}" \ - data.val_files="${TEST_FILE}" \ - data.prompt_key=prompt \ - data.truncation='left' \ - 
data.max_prompt_length=${max_prompt_length} \ - data.max_response_length=${max_response_length} \ - data.train_batch_size=${train_prompt_bsz} \ - actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ - algorithm.adv_estimator=${adv_estimator} \ - algorithm.use_kl_in_reward=${use_kl_in_reward} \ - algorithm.kl_ctrl.kl_coef=${kl_coef} \ - actor_rollout_ref.actor.strategy=fsdp2 \ - critic.strategy=fsdp2 \ - actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ - actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ - actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ - actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ - actor_rollout_ref.actor.clip_ratio_c=10.0 \ - actor_rollout_ref.model.use_remove_padding=True \ - +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ - actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ - actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.model.path="${MODEL_PATH}" \ - actor_rollout_ref.model.enable_gradient_checkpointing=True \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ - actor_rollout_ref.actor.optim.weight_decay=0.1 \ - actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ - actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \ - actor_rollout_ref.actor.entropy_coeff=0 \ - actor_rollout_ref.actor.grad_clip=1.0 \ - actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ - actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ - 
actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ - actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ - actor_rollout_ref.rollout.enable_chunked_prefill=True \ - actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ - actor_rollout_ref.rollout.temperature=${temperature} \ - actor_rollout_ref.rollout.top_p=${top_p} \ - actor_rollout_ref.rollout.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ - actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ - actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.do_sample=True \ - actor_rollout_ref.rollout.val_kwargs.n=1 \ - actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \ - actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ - reward_model.reward_manager=dapo \ - +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ - +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ - trainer.logger=['console','tensorboard'] \ - trainer.project_name="${project_name}" \ - trainer.experiment_name="${exp_name}" \ - trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - trainer.nnodes="${NNODES}" \ - trainer.val_before_train=True \ - trainer.test_freq=20 \ - trainer.save_freq=-1 \ - trainer.total_epochs=10 \ - trainer.total_training_steps=400 \ - trainer.default_local_dir="${CKPTS_DIR}" \ - trainer.resume_mode=auto \ - trainer.log_val_generations=10 diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate/runtime_env.yaml deleted file mode 100644 index 
39c5a3593e8..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_colocate/runtime_env.yaml +++ /dev/null @@ -1,3 +0,0 @@ -env_vars: - TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math/dapo_qwen2-7B-math_28k_fsdp2_colocate_32_mbs32" - HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/run.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/run.sh deleted file mode 100644 index 9fca6da9878..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/run.sh +++ /dev/null @@ -1,154 +0,0 @@ -#!/usr/bin/env bash -set -xeuo pipefail - -project_name='DAPO' -exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tpf16_fixmcs' - -# Ray -MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B -CKPTS_DIR=./ckpts/${project_name}/${exp_name} -TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet -TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet - -rollout_mode="async" -rollout_name="vllm" # sglang or vllm -if [ "$rollout_mode" = "async" ]; then - export VLLM_USE_V1=1 - return_raw_chat="True" -fi - -# Algorithm parameters -adv_estimator=grpo - -use_kl_in_reward=False -kl_coef=0.0 -use_kl_loss=False -kl_loss_coef=0.0 - -clip_ratio_low=0.2 -clip_ratio_high=0.28 - -# Response length parameters -max_prompt_length=$((1024 * 2)) -max_response_length=$((1024 * 28)) -enable_overlong_buffer=True -overlong_buffer_len=$((1024 * 4)) -overlong_penalty_factor=1.0 - -# Training parameters -loss_agg_mode="token-mean" - -# Algorithm -temperature=1.0 -top_p=1.0 -top_k=-1 # 0 for HF rollout, -1 for vLLM rollout -val_top_p=0.7 - -# Performance Related Parameter -use_dynamic_bsz=True 
-actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) -infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) -ref_offload=True -actor_offload=False -gen_tp=4 -sp_size=4 -fsdp_size=8 - -# Fully async specific parameters -NNODES_ROLLOUT=${NNODES_ROLLOUT:-2} -NNODES_TRAIN=${NNODES_TRAIN:-2} -NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} - - -train_prompt_bsz=0 -gen_prompt_bsz=1 -n_resp_per_prompt=16 -train_prompt_mini_bsz=32 -total_rollout_steps=$(((512*400))) -test_freq=20 -staleness_threshold=0.1 -trigger_parameter_sync_step=16 -partial_rollout=True - -python -m recipe.fully_async_policy.fully_async_main \ - data.train_files="${TRAIN_FILE}" \ - data.val_files="${TEST_FILE}" \ - data.prompt_key=prompt \ - data.truncation='left' \ - data.max_prompt_length=${max_prompt_length} \ - data.max_response_length=${max_response_length} \ - data.train_batch_size=${train_prompt_bsz} \ - data.gen_batch_size=${gen_prompt_bsz} \ - data.return_raw_chat=${return_raw_chat} \ - actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ - algorithm.adv_estimator=${adv_estimator} \ - algorithm.use_kl_in_reward=${use_kl_in_reward} \ - algorithm.kl_ctrl.kl_coef=${kl_coef} \ - actor_rollout_ref.actor.strategy=fsdp2 \ - critic.strategy=fsdp2 \ - actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ - actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ - actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ - actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ - actor_rollout_ref.actor.clip_ratio_c=10.0 \ - actor_rollout_ref.model.use_remove_padding=True \ - actor_rollout_ref.hybrid_engine=False \ - +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ - actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} 
\ - actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.model.path="${MODEL_PATH}" \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ - actor_rollout_ref.actor.optim.weight_decay=0.1 \ - actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ - actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \ - actor_rollout_ref.actor.entropy_coeff=0 \ - actor_rollout_ref.actor.grad_clip=1.0 \ - actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ - actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ - actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ - actor_rollout_ref.rollout.enable_chunked_prefill=True \ - actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ - actor_rollout_ref.rollout.temperature=${temperature} \ - actor_rollout_ref.rollout.top_p=${top_p} \ - actor_rollout_ref.rollout.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ - actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ - actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.do_sample=True \ - actor_rollout_ref.rollout.val_kwargs.n=1 \ - actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \ - actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ - actor_rollout_ref.rollout.name=${rollout_name} \ - actor_rollout_ref.rollout.mode=${rollout_mode} \ - actor_rollout_ref.rollout.calculate_log_probs=True \ - reward_model.reward_manager=dapo \ - +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ - 
+reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ - +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ - trainer.logger=['console','tensorboard'] \ - trainer.project_name="${project_name}" \ - trainer.experiment_name="${exp_name}" \ - trainer.val_before_train=True \ - trainer.save_freq=-1 \ - trainer.default_local_dir="${CKPTS_DIR}" \ - trainer.resume_mode=auto \ - trainer.nnodes="${NNODES_TRAIN}" \ - trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - rollout.nnodes="${NNODES_ROLLOUT}" \ - rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ - rollout.total_rollout_steps="${total_rollout_steps}" \ - rollout.total_epochs=10 \ - rollout.test_freq="${test_freq}" \ - async_training.staleness_threshold="${staleness_threshold}" \ - async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ - async_training.partial_rollout="${partial_rollout}" diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/runtime_env.yaml deleted file mode 100644 index 5f0292d2c0d..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_16-16/runtime_env.yaml +++ /dev/null @@ -1,4 +0,0 @@ -env_vars: - VLLM_USE_V1: "1" - TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math/dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16_mbs32_tpf16-fsdpsize_8" - HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/run.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/run.sh deleted file mode 100644 index 3de9279a9bc..00000000000 --- 
a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/run.sh +++ /dev/null @@ -1,168 +0,0 @@ -#!/usr/bin/env bash -set -xeuo pipefail - -project_name='DAPO' -exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-8_mbs32_tpf32' - -# Ray -# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} -# WORKING_DIR=${WORKING_DIR:-"${PWD}"} -# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} -# Paths -RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} -# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface -MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"} -CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} -TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} -TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} - -# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B -MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B -CKPTS_DIR=./ckpts/${project_name}/${exp_name} -# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet -# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet -TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet -TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet - -rollout_mode="async" -rollout_name="vllm" # sglang or vllm -if [ "$rollout_mode" = "async" ]; then - export VLLM_USE_V1=1 - return_raw_chat="True" -fi - -# Algorithm parameters -adv_estimator=grpo - -use_kl_in_reward=False -kl_coef=0.0 -use_kl_loss=False -kl_loss_coef=0.0 - -clip_ratio_low=0.2 -clip_ratio_high=0.28 - -# Response length parameters 
-max_prompt_length=$((1024 * 2)) -max_response_length=$((1024 * 28)) -enable_overlong_buffer=True -overlong_buffer_len=$((1024 * 4)) -overlong_penalty_factor=1.0 - -# Training parameters -loss_agg_mode="token-mean" - -# Algorithm -temperature=1.0 -top_p=1.0 -top_k=-1 # 0 for HF rollout, -1 for vLLM rollout -val_top_p=0.7 - -# Performance Related Parameter -use_dynamic_bsz=True -actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) -infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) -ref_offload=True -actor_offload=False -gen_tp=4 -sp_size=4 -fsdp_size=2 - -# Fully async specific parameters -NNODES_ROLLOUT=${NNODES_ROLLOUT:-3} -NNODES_TRAIN=${NNODES_TRAIN:-1} -NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} - - -train_prompt_bsz=0 -gen_prompt_bsz=1 -n_resp_per_prompt=16 -train_prompt_mini_bsz=32 -total_rollout_steps=$(((512*400))) -test_freq=20 -staleness_threshold=0.1 -trigger_parameter_sync_step=32 -partial_rollout=True - -python -m recipe.fully_async_policy.fully_async_main \ - data.train_files="${TRAIN_FILE}" \ - data.val_files="${TEST_FILE}" \ - data.prompt_key=prompt \ - data.truncation='left' \ - data.max_prompt_length=${max_prompt_length} \ - data.max_response_length=${max_response_length} \ - data.train_batch_size=${train_prompt_bsz} \ - data.gen_batch_size=${gen_prompt_bsz} \ - data.return_raw_chat=${return_raw_chat} \ - actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ - algorithm.adv_estimator=${adv_estimator} \ - algorithm.use_kl_in_reward=${use_kl_in_reward} \ - algorithm.kl_ctrl.kl_coef=${kl_coef} \ - actor_rollout_ref.actor.strategy=fsdp2 \ - critic.strategy=fsdp2 \ - actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ - actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ - actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ - actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ - actor_rollout_ref.actor.clip_ratio_c=10.0 \ - actor_rollout_ref.model.use_remove_padding=True \ - 
actor_rollout_ref.hybrid_engine=False \ - +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ - actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ - actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.model.path="${MODEL_PATH}" \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ - actor_rollout_ref.actor.optim.weight_decay=0.1 \ - actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ - actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \ - actor_rollout_ref.actor.entropy_coeff=0 \ - actor_rollout_ref.actor.grad_clip=1.0 \ - actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ - actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ - actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ - actor_rollout_ref.rollout.enable_chunked_prefill=True \ - actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ - actor_rollout_ref.rollout.temperature=${temperature} \ - actor_rollout_ref.rollout.top_p=${top_p} \ - actor_rollout_ref.rollout.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ - actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ - actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.do_sample=True \ - actor_rollout_ref.rollout.val_kwargs.n=1 \ - actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \ - 
actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ - actor_rollout_ref.rollout.name=${rollout_name} \ - actor_rollout_ref.rollout.mode=${rollout_mode} \ - actor_rollout_ref.rollout.calculate_log_probs=True \ - reward_model.reward_manager=dapo \ - +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ - +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ - trainer.logger=['console','tensorboard'] \ - trainer.project_name="${project_name}" \ - trainer.experiment_name="${exp_name}" \ - trainer.val_before_train=True \ - trainer.save_freq=-1 \ - trainer.default_local_dir="${CKPTS_DIR}" \ - trainer.resume_mode=auto \ - trainer.nnodes="${NNODES_TRAIN}" \ - trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - rollout.nnodes="${NNODES_ROLLOUT}" \ - rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ - rollout.total_rollout_steps="${total_rollout_steps}" \ - rollout.total_epochs=10 \ - rollout.test_freq="${test_freq}" \ - async_training.staleness_threshold="${staleness_threshold}" \ - async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ - async_training.partial_rollout="${partial_rollout}" diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/runtime_env.yaml deleted file mode 100644 index 7402c1b37b0..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_24-8/runtime_env.yaml +++ /dev/null @@ -1,5 +0,0 @@ -env_vars: - VLLM_USE_V1: "1" - TENSORBOARD_DIR: 
"/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math/dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-8_mbs32_tpf32" - NCCL_DEBUG: "INFO" - HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/run.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/run.sh deleted file mode 100644 index 4ba49146329..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/run.sh +++ /dev/null @@ -1,168 +0,0 @@ -#!/usr/bin/env bash -set -xeuo pipefail - -project_name='DAPO' -exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf11' - -# Ray -# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} -# WORKING_DIR=${WORKING_DIR:-"${PWD}"} -# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} -# Paths -RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} -# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface -MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"} -CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} -TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} -TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} - -# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B -MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B -CKPTS_DIR=./ckpts/${project_name}/${exp_name} -# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet -# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet -TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet 
-TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet - -rollout_mode="async" -rollout_name="vllm" # sglang or vllm -if [ "$rollout_mode" = "async" ]; then - export VLLM_USE_V1=1 - return_raw_chat="True" -fi - -# Algorithm parameters -adv_estimator=grpo - -use_kl_in_reward=False -kl_coef=0.0 -use_kl_loss=False -kl_loss_coef=0.0 - -clip_ratio_low=0.2 -clip_ratio_high=0.28 - -# Response length parameters -max_prompt_length=$((1024 * 2)) -max_response_length=$((1024 * 28)) -enable_overlong_buffer=True -overlong_buffer_len=$((1024 * 4)) -overlong_penalty_factor=1.0 - -# Training parameters -loss_agg_mode="token-mean" - -# Algorithm -temperature=1.0 -top_p=1.0 -top_k=-1 # 0 for HF rollout, -1 for vLLM rollout -val_top_p=0.7 - -# Performance Related Parameter -use_dynamic_bsz=True -actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) -infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) -ref_offload=True -actor_offload=False -gen_tp=4 -sp_size=4 -fsdp_size=2 - -# Fully async specific parameters -NNODES_ROLLOUT=${NNODES_ROLLOUT:-1} -NNODES_TRAIN=${NNODES_TRAIN:-3} -NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} - - -train_prompt_bsz=0 -gen_prompt_bsz=1 -n_resp_per_prompt=16 -train_prompt_mini_bsz=32 -total_rollout_steps=$(((512*400))) -test_freq=20 -staleness_threshold=0.1 -trigger_parameter_sync_step=11 -partial_rollout=True - -python -m recipe.fully_async_policy.fully_async_main \ - data.train_files="${TRAIN_FILE}" \ - data.val_files="${TEST_FILE}" \ - data.prompt_key=prompt \ - data.truncation='left' \ - data.max_prompt_length=${max_prompt_length} \ - data.max_response_length=${max_response_length} \ - data.train_batch_size=${train_prompt_bsz} \ - data.gen_batch_size=${gen_prompt_bsz} \ - data.return_raw_chat=${return_raw_chat} \ - actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ - algorithm.adv_estimator=${adv_estimator} \ - algorithm.use_kl_in_reward=${use_kl_in_reward} \ - 
algorithm.kl_ctrl.kl_coef=${kl_coef} \ - actor_rollout_ref.actor.strategy=fsdp2 \ - critic.strategy=fsdp2 \ - actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ - actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ - actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ - actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ - actor_rollout_ref.actor.clip_ratio_c=10.0 \ - actor_rollout_ref.model.use_remove_padding=True \ - actor_rollout_ref.hybrid_engine=False \ - +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ - actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ - actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.model.path="${MODEL_PATH}" \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ - actor_rollout_ref.actor.optim.weight_decay=0.1 \ - actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ - actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \ - actor_rollout_ref.actor.entropy_coeff=0 \ - actor_rollout_ref.actor.grad_clip=1.0 \ - actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ - actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ - actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ - actor_rollout_ref.rollout.enable_chunked_prefill=True \ - actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ - actor_rollout_ref.rollout.temperature=${temperature} \ - 
actor_rollout_ref.rollout.top_p=${top_p} \ - actor_rollout_ref.rollout.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ - actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ - actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.do_sample=True \ - actor_rollout_ref.rollout.val_kwargs.n=1 \ - actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \ - actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ - actor_rollout_ref.rollout.name=${rollout_name} \ - actor_rollout_ref.rollout.mode=${rollout_mode} \ - actor_rollout_ref.rollout.calculate_log_probs=True \ - reward_model.reward_manager=dapo \ - +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ - +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ - trainer.logger=['console','tensorboard'] \ - trainer.project_name="${project_name}" \ - trainer.experiment_name="${exp_name}" \ - trainer.val_before_train=True \ - trainer.save_freq=-1 \ - trainer.default_local_dir="${CKPTS_DIR}" \ - trainer.resume_mode=auto \ - trainer.nnodes="${NNODES_TRAIN}" \ - trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - rollout.nnodes="${NNODES_ROLLOUT}" \ - rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ - rollout.total_rollout_steps="${total_rollout_steps}" \ - rollout.total_epochs=10 \ - rollout.test_freq="${test_freq}" \ - async_training.staleness_threshold="${staleness_threshold}" \ - async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ - async_training.partial_rollout="${partial_rollout}" diff --git 
a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/runtime_env.yaml deleted file mode 100644 index fc404cfd985..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/fsdp2_fully-async_8-24/runtime_env.yaml +++ /dev/null @@ -1,5 +0,0 @@ -env_vars: - VLLM_USE_V1: "1" - TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math/dapo_qwen2-7B-math_28k_fsdp2_fully-async_8-24_mbs32_tpf11" - NCCL_DEBUG: "INFO" - HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/run.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/run.sh deleted file mode 100644 index 3879a99df67..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/run.sh +++ /dev/null @@ -1,135 +0,0 @@ -#!/usr/bin/env bash -set -xeuo pipefail - -project_name='DAPO' -exp_name='dapo_qwen2-7B-math_28k_megatron_colocate_32_mbs32' - -adv_estimator=grpo - -use_kl_in_reward=False -kl_coef=0.0 -use_kl_loss=False -kl_loss_coef=0.0 - -clip_ratio_low=0.2 -clip_ratio_high=0.28 - -max_prompt_length=$((1024 * 2)) -max_response_length=$((1024 * 28)) -enable_overlong_buffer=True -overlong_buffer_len=$((1024 * 4)) -overlong_penalty_factor=1.0 - -loss_agg_mode="token-mean" - -train_prompt_bsz=512 -n_resp_per_prompt=16 -train_prompt_mini_bsz=32 - -# Ray -# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} -# WORKING_DIR=${WORKING_DIR:-"${PWD}"} -# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} -NNODES=${NNODES:-4} -NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} -# Paths -MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B -CKPTS_DIR=./ckpts/${project_name}/${exp_name} 
-TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet -TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet - -# Algorithm -temperature=1.0 -top_p=1.0 -top_k=-1 # 0 for HF rollout, -1 for vLLM rollout -val_top_p=0.7 - -# Performance Related Parameter -use_dynamic_bsz=True -actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) -infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) -offload=True -gen_tp=4 -train_tp=4 -train_pp=2 - -# TODO: support dynamic_bsz for megatron -# actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ -# actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ -# actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ -# actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ -# actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ -# actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - -python3 -m verl.trainer.main_ppo \ - --config-path=config \ - --config-name='ppo_megatron_trainer.yaml' \ - data.train_files="${TRAIN_FILE}" \ - data.val_files="${TEST_FILE}" \ - data.prompt_key=prompt \ - data.truncation='left' \ - data.max_prompt_length=${max_prompt_length} \ - data.max_response_length=${max_response_length} \ - data.train_batch_size=${train_prompt_bsz} \ - actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ - algorithm.adv_estimator=${adv_estimator} \ - algorithm.use_kl_in_reward=${use_kl_in_reward} \ - algorithm.kl_ctrl.kl_coef=${kl_coef} \ - actor_rollout_ref.actor.strategy=megatron \ - critic.strategy=megatron \ - actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ - actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ - actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ - actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ - 
actor_rollout_ref.actor.clip_ratio_c=10.0 \ - +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \ - actor_rollout_ref.model.path="${MODEL_PATH}" \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ - actor_rollout_ref.actor.optim.weight_decay=0.1 \ - actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ - actor_rollout_ref.actor.megatron.param_offload=${offload} \ - actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \ - actor_rollout_ref.actor.megatron.grad_offload=${offload} \ - actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \ - actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \ - actor_rollout_ref.actor.entropy_coeff=0 \ - actor_rollout_ref.actor.optim.clip_grad=1.0 \ - actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ - actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ - actor_rollout_ref.rollout.enable_chunked_prefill=True \ - actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ - actor_rollout_ref.rollout.temperature=${temperature} \ - actor_rollout_ref.rollout.top_p=${top_p} \ - actor_rollout_ref.rollout.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ - actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ - actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.do_sample=True \ - actor_rollout_ref.rollout.val_kwargs.n=1 \ - actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \ - actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \ - actor_rollout_ref.ref.megatron.param_offload=${offload} \ - 
reward_model.reward_manager=dapo \ - +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ - +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ - trainer.logger=['console','tensorboard'] \ - trainer.project_name="${project_name}" \ - trainer.experiment_name="${exp_name}" \ - trainer.n_gpus_per_node=8 \ - trainer.nnodes="${NNODES}" \ - trainer.val_before_train=True \ - trainer.test_freq=20 \ - trainer.save_freq=-1 \ - trainer.total_epochs=10 \ - trainer.total_training_steps=400 \ - trainer.default_local_dir="${CKPTS_DIR}" \ - trainer.resume_mode=auto \ - trainer.log_val_generations=10 diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/runtime_env.yaml deleted file mode 100644 index 3a35b4a52ad..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_32/megatron_colocate/runtime_env.yaml +++ /dev/null @@ -1,3 +0,0 @@ -env_vars: - TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math/dapo_qwen2-7B-math_28k_megatron_colocate_32_mbs32" - HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_colocate/run.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_colocate/run.sh deleted file mode 100644 index e6ab551869d..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_colocate/run.sh +++ /dev/null @@ -1,133 +0,0 @@ -#!/usr/bin/env bash -set -xeuo pipefail - -project_name='DAPO' -exp_name='dapo_qwen2-7B-math_28k_fsdp2_colocate_64_mbs32' - -adv_estimator=grpo - -use_kl_in_reward=False -kl_coef=0.0 -use_kl_loss=False -kl_loss_coef=0.0 - 
-clip_ratio_low=0.2 -clip_ratio_high=0.28 - -max_prompt_length=$((1024 * 2)) -max_response_length=$((1024 * 28)) -enable_overlong_buffer=True -overlong_buffer_len=$((1024 * 4)) -overlong_penalty_factor=1.0 - -loss_agg_mode="token-mean" - -train_prompt_bsz=512 -n_resp_per_prompt=16 -train_prompt_mini_bsz=32 - -# Ray -# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} -# WORKING_DIR=${WORKING_DIR:-"${PWD}"} -# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} -NNODES=${NNODES:-8} -NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} -# Paths -RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} -# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface - -MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B -CKPTS_DIR=./ckpts/${project_name}/${exp_name} -TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet -TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet -# Algorithm -temperature=1.0 -top_p=1.0 -top_k=-1 # 0 for HF rollout, -1 for vLLM rollout -val_top_p=0.7 - -# Performance Related Parameter -use_dynamic_bsz=True -actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) -infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) -offload=True -gen_tp=4 -sp_size=4 -fsdp_size=2 - -# reference run wandb: https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/runs/ow47vvon?nw=nwusertongyuxuan361 - - -python -m verl.trainer.main_ppo \ - data.train_files="${TRAIN_FILE}" \ - data.val_files="${TEST_FILE}" \ - data.prompt_key=prompt \ - data.truncation='left' \ - data.max_prompt_length=${max_prompt_length} \ - data.max_response_length=${max_response_length} \ - data.train_batch_size=${train_prompt_bsz} \ - actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ - 
algorithm.adv_estimator=${adv_estimator} \ - algorithm.use_kl_in_reward=${use_kl_in_reward} \ - algorithm.kl_ctrl.kl_coef=${kl_coef} \ - actor_rollout_ref.actor.strategy=fsdp2 \ - critic.strategy=fsdp2 \ - actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ - actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ - actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ - actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ - actor_rollout_ref.actor.clip_ratio_c=10.0 \ - actor_rollout_ref.model.use_remove_padding=True \ - +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ - actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ - actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.model.path="${MODEL_PATH}" \ - actor_rollout_ref.model.enable_gradient_checkpointing=True \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ - actor_rollout_ref.actor.optim.weight_decay=0.1 \ - actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ - actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \ - actor_rollout_ref.actor.entropy_coeff=0 \ - actor_rollout_ref.actor.grad_clip=1.0 \ - actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ - actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ - actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ - actor_rollout_ref.rollout.enable_chunked_prefill=True \ - 
actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ - actor_rollout_ref.rollout.temperature=${temperature} \ - actor_rollout_ref.rollout.top_p=${top_p} \ - actor_rollout_ref.rollout.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ - actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ - actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.do_sample=True \ - actor_rollout_ref.rollout.val_kwargs.n=1 \ - actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \ - actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ - reward_model.reward_manager=dapo \ - +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ - +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ - trainer.logger=['console','tensorboard'] \ - trainer.project_name="${project_name}" \ - trainer.experiment_name="${exp_name}" \ - trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - trainer.nnodes="${NNODES}" \ - trainer.val_before_train=True \ - trainer.test_freq=20 \ - trainer.save_freq=-1 \ - trainer.total_epochs=10 \ - trainer.total_training_steps=400 \ - trainer.default_local_dir="${CKPTS_DIR}" \ - trainer.resume_mode=auto \ - trainer.log_val_generations=10 diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_colocate/runtime_env.yaml deleted file mode 100644 index 514ab9a73f0..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_colocate/runtime_env.yaml +++ /dev/null @@ -1,3 +0,0 @@ -env_vars: - TENSORBOARD_DIR: 
"/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-64/dapo_qwen2-7B-math_28k_fsdp2_colocate_64_mbs32" - HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/run.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/run.sh deleted file mode 100644 index 3d56ea8b403..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/run.sh +++ /dev/null @@ -1,168 +0,0 @@ -#!/usr/bin/env bash -set -xeuo pipefail - -project_name='DAPO' -exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-40_mbs32_tpf6' - -# Ray -# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} -# WORKING_DIR=${WORKING_DIR:-"${PWD}"} -# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} -# Paths -RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} -# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface -MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"} -CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} -TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} -TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} - -# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B -MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B -CKPTS_DIR=./ckpts/${project_name}/${exp_name} -# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet -# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet -TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet 
-TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet - -rollout_mode="async" -rollout_name="vllm" # sglang or vllm -if [ "$rollout_mode" = "async" ]; then - export VLLM_USE_V1=1 - return_raw_chat="True" -fi - -# Algorithm parameters -adv_estimator=grpo - -use_kl_in_reward=False -kl_coef=0.0 -use_kl_loss=False -kl_loss_coef=0.0 - -clip_ratio_low=0.2 -clip_ratio_high=0.28 - -# Response length parameters -max_prompt_length=$((1024 * 2)) -max_response_length=$((1024 * 28)) -enable_overlong_buffer=True -overlong_buffer_len=$((1024 * 4)) -overlong_penalty_factor=1.0 - -# Training parameters -loss_agg_mode="token-mean" - -# Algorithm -temperature=1.0 -top_p=1.0 -top_k=-1 # 0 for HF rollout, -1 for vLLM rollout -val_top_p=0.7 - -# Performance Related Parameter -use_dynamic_bsz=True -actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) -infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) -ref_offload=True -actor_offload=False -gen_tp=4 -sp_size=4 -fsdp_size=2 - -# Fully async specific parameters -NNODES_ROLLOUT=${NNODES_ROLLOUT:-3} -NNODES_TRAIN=${NNODES_TRAIN:-5} -NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} - - -train_prompt_bsz=0 -gen_prompt_bsz=1 -n_resp_per_prompt=16 -train_prompt_mini_bsz=32 -total_rollout_steps=$(((512*400))) -test_freq=20 -staleness_threshold=0.1 -trigger_parameter_sync_step=6 -partial_rollout=True - -python -m recipe.fully_async_policy.fully_async_main \ - data.train_files="${TRAIN_FILE}" \ - data.val_files="${TEST_FILE}" \ - data.prompt_key=prompt \ - data.truncation='left' \ - data.max_prompt_length=${max_prompt_length} \ - data.max_response_length=${max_response_length} \ - data.train_batch_size=${train_prompt_bsz} \ - data.gen_batch_size=${gen_prompt_bsz} \ - data.return_raw_chat=${return_raw_chat} \ - actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ - algorithm.adv_estimator=${adv_estimator} \ - algorithm.use_kl_in_reward=${use_kl_in_reward} \ - 
algorithm.kl_ctrl.kl_coef=${kl_coef} \ - actor_rollout_ref.actor.strategy=fsdp2 \ - critic.strategy=fsdp2 \ - actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ - actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ - actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ - actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ - actor_rollout_ref.actor.clip_ratio_c=10.0 \ - actor_rollout_ref.model.use_remove_padding=True \ - actor_rollout_ref.hybrid_engine=False \ - +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ - actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ - actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.model.path="${MODEL_PATH}" \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ - actor_rollout_ref.actor.optim.weight_decay=0.1 \ - actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ - actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \ - actor_rollout_ref.actor.entropy_coeff=0 \ - actor_rollout_ref.actor.grad_clip=1.0 \ - actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ - actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ - actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ - actor_rollout_ref.rollout.enable_chunked_prefill=True \ - actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ - actor_rollout_ref.rollout.temperature=${temperature} \ - 
actor_rollout_ref.rollout.top_p=${top_p} \ - actor_rollout_ref.rollout.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ - actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ - actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.do_sample=True \ - actor_rollout_ref.rollout.val_kwargs.n=1 \ - actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \ - actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ - actor_rollout_ref.rollout.name=${rollout_name} \ - actor_rollout_ref.rollout.mode=${rollout_mode} \ - actor_rollout_ref.rollout.calculate_log_probs=True \ - reward_model.reward_manager=dapo \ - +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ - +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ - trainer.logger=['console','tensorboard'] \ - trainer.project_name="${project_name}" \ - trainer.experiment_name="${exp_name}" \ - trainer.val_before_train=True \ - trainer.save_freq=-1 \ - trainer.default_local_dir="${CKPTS_DIR}" \ - trainer.resume_mode=auto \ - trainer.nnodes="${NNODES_TRAIN}" \ - trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - rollout.nnodes="${NNODES_ROLLOUT}" \ - rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ - rollout.total_rollout_steps="${total_rollout_steps}" \ - rollout.total_epochs=10 \ - rollout.test_freq="${test_freq}" \ - async_training.staleness_threshold="${staleness_threshold}" \ - async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ - async_training.partial_rollout="${partial_rollout}" diff --git 
a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/runtime_env.yaml deleted file mode 100644 index ef67409ba6f..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_24-40/runtime_env.yaml +++ /dev/null @@ -1,5 +0,0 @@ -env_vars: - VLLM_USE_V1: "1" - TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-64/dapo_qwen2-7B-math_28k_fsdp2_fully-async_24-40_mbs32_tpf6" - NCCL_DEBUG: "INFO" - HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/run.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/run.sh deleted file mode 100644 index cc26be4f100..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/run.sh +++ /dev/null @@ -1,168 +0,0 @@ -#!/usr/bin/env bash -set -xeuo pipefail - -project_name='DAPO' -exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_32-32_mbs32_tpf8_fixmcs' - -# Ray -# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} -# WORKING_DIR=${WORKING_DIR:-"${PWD}"} -# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} -# Paths -RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} -# very important! 
please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface -MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"} -CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} -TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} -TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} - -# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B -MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B -CKPTS_DIR=./ckpts/${project_name}/${exp_name} -# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet -# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet -TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet -TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet - -rollout_mode="async" -rollout_name="vllm" # sglang or vllm -if [ "$rollout_mode" = "async" ]; then - export VLLM_USE_V1=1 - return_raw_chat="True" -fi - -# Algorithm parameters -adv_estimator=grpo - -use_kl_in_reward=False -kl_coef=0.0 -use_kl_loss=False -kl_loss_coef=0.0 - -clip_ratio_low=0.2 -clip_ratio_high=0.28 - -# Response length parameters -max_prompt_length=$((1024 * 2)) -max_response_length=$((1024 * 28)) -enable_overlong_buffer=True -overlong_buffer_len=$((1024 * 4)) -overlong_penalty_factor=1.0 - -# Training parameters -loss_agg_mode="token-mean" - -# Algorithm -temperature=1.0 -top_p=1.0 -top_k=-1 # 0 for HF rollout, -1 for vLLM rollout -val_top_p=0.7 - -# Performance Related Parameter -use_dynamic_bsz=True -actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) -infer_ppo_max_token_len=$(((max_prompt_length + 
max_response_length) * 3)) -ref_offload=True -actor_offload=False -gen_tp=4 -sp_size=4 -fsdp_size=2 - -# Fully async specific parameters -NNODES_ROLLOUT=${NNODES_ROLLOUT:-4} -NNODES_TRAIN=${NNODES_TRAIN:-4} -NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} - - -train_prompt_bsz=0 -gen_prompt_bsz=1 -n_resp_per_prompt=16 -train_prompt_mini_bsz=32 -total_rollout_steps=$(((512*400))) -test_freq=20 -staleness_threshold=0.1 -trigger_parameter_sync_step=8 -partial_rollout=True - -python -m recipe.fully_async_policy.fully_async_main \ - data.train_files="${TRAIN_FILE}" \ - data.val_files="${TEST_FILE}" \ - data.prompt_key=prompt \ - data.truncation='left' \ - data.max_prompt_length=${max_prompt_length} \ - data.max_response_length=${max_response_length} \ - data.train_batch_size=${train_prompt_bsz} \ - data.gen_batch_size=${gen_prompt_bsz} \ - data.return_raw_chat=${return_raw_chat} \ - actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ - algorithm.adv_estimator=${adv_estimator} \ - algorithm.use_kl_in_reward=${use_kl_in_reward} \ - algorithm.kl_ctrl.kl_coef=${kl_coef} \ - actor_rollout_ref.actor.strategy=fsdp2 \ - critic.strategy=fsdp2 \ - actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ - actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ - actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ - actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ - actor_rollout_ref.actor.clip_ratio_c=10.0 \ - actor_rollout_ref.model.use_remove_padding=True \ - actor_rollout_ref.hybrid_engine=False \ - +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ - actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ - actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - 
actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.model.path="${MODEL_PATH}" \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ - actor_rollout_ref.actor.optim.weight_decay=0.1 \ - actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ - actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \ - actor_rollout_ref.actor.entropy_coeff=0 \ - actor_rollout_ref.actor.grad_clip=1.0 \ - actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ - actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ - actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ - actor_rollout_ref.rollout.enable_chunked_prefill=True \ - actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ - actor_rollout_ref.rollout.temperature=${temperature} \ - actor_rollout_ref.rollout.top_p=${top_p} \ - actor_rollout_ref.rollout.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ - actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ - actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.do_sample=True \ - actor_rollout_ref.rollout.val_kwargs.n=1 \ - actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \ - actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ - actor_rollout_ref.rollout.name=${rollout_name} \ - actor_rollout_ref.rollout.mode=${rollout_mode} \ - actor_rollout_ref.rollout.calculate_log_probs=True \ - reward_model.reward_manager=dapo \ - +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ - 
+reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ - +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ - trainer.logger=['console','tensorboard'] \ - trainer.project_name="${project_name}" \ - trainer.experiment_name="${exp_name}" \ - trainer.val_before_train=True \ - trainer.save_freq=-1 \ - trainer.default_local_dir="${CKPTS_DIR}" \ - trainer.resume_mode=auto \ - trainer.nnodes="${NNODES_TRAIN}" \ - trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - rollout.nnodes="${NNODES_ROLLOUT}" \ - rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ - rollout.total_rollout_steps="${total_rollout_steps}" \ - rollout.total_epochs=10 \ - rollout.test_freq="${test_freq}" \ - async_training.staleness_threshold="${staleness_threshold}" \ - async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ - async_training.partial_rollout="${partial_rollout}" diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/runtime_env.yaml deleted file mode 100644 index 160cd46c499..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_32-32/runtime_env.yaml +++ /dev/null @@ -1,4 +0,0 @@ -env_vars: - VLLM_USE_V1: "1" - TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-64/dapo_qwen2-7B-math_28k_fsdp2_fully-async_32-32_mbs32_tpf8_fixmcs" - HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/run.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/run.sh deleted file mode 100644 index 0a67a563819..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/run.sh +++ /dev/null @@ -1,168 +0,0 @@ -#!/usr/bin/env bash -set -xeuo 
pipefail - -project_name='DAPO' -exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_40-24_mbs32_tpf11' - -# Ray -# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} -# WORKING_DIR=${WORKING_DIR:-"${PWD}"} -# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} -# Paths -RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} -# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface -MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"} -CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} -TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} -TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} - -# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B -MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B -CKPTS_DIR=./ckpts/${project_name}/${exp_name} -# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet -# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet -TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet -TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet - -rollout_mode="async" -rollout_name="vllm" # sglang or vllm -if [ "$rollout_mode" = "async" ]; then - export VLLM_USE_V1=1 - return_raw_chat="True" -fi - -# Algorithm parameters -adv_estimator=grpo - -use_kl_in_reward=False -kl_coef=0.0 -use_kl_loss=False -kl_loss_coef=0.0 - -clip_ratio_low=0.2 -clip_ratio_high=0.28 - -# Response length parameters -max_prompt_length=$((1024 * 2)) -max_response_length=$((1024 * 28)) -enable_overlong_buffer=True -overlong_buffer_len=$((1024 * 4)) 
-overlong_penalty_factor=1.0 - -# Training parameters -loss_agg_mode="token-mean" - -# Algorithm -temperature=1.0 -top_p=1.0 -top_k=-1 # 0 for HF rollout, -1 for vLLM rollout -val_top_p=0.7 - -# Performance Related Parameter -use_dynamic_bsz=True -actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) -infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) -ref_offload=True -actor_offload=False -gen_tp=4 -sp_size=4 -fsdp_size=2 - -# Fully async specific parameters -NNODES_ROLLOUT=${NNODES_ROLLOUT:-5} -NNODES_TRAIN=${NNODES_TRAIN:-3} -NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} - - -train_prompt_bsz=0 -gen_prompt_bsz=1 -n_resp_per_prompt=16 -train_prompt_mini_bsz=32 -total_rollout_steps=$(((512*400))) -test_freq=20 -staleness_threshold=0.1 -trigger_parameter_sync_step=11 -partial_rollout=True - -python -m recipe.fully_async_policy.fully_async_main \ - data.train_files="${TRAIN_FILE}" \ - data.val_files="${TEST_FILE}" \ - data.prompt_key=prompt \ - data.truncation='left' \ - data.max_prompt_length=${max_prompt_length} \ - data.max_response_length=${max_response_length} \ - data.train_batch_size=${train_prompt_bsz} \ - data.gen_batch_size=${gen_prompt_bsz} \ - data.return_raw_chat=${return_raw_chat} \ - actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ - algorithm.adv_estimator=${adv_estimator} \ - algorithm.use_kl_in_reward=${use_kl_in_reward} \ - algorithm.kl_ctrl.kl_coef=${kl_coef} \ - actor_rollout_ref.actor.strategy=fsdp2 \ - critic.strategy=fsdp2 \ - actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ - actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ - actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ - actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ - actor_rollout_ref.actor.clip_ratio_c=10.0 \ - actor_rollout_ref.model.use_remove_padding=True \ - actor_rollout_ref.hybrid_engine=False \ - +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ - 
actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ - actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.model.path="${MODEL_PATH}" \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ - actor_rollout_ref.actor.optim.weight_decay=0.1 \ - actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ - actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \ - actor_rollout_ref.actor.entropy_coeff=0 \ - actor_rollout_ref.actor.grad_clip=1.0 \ - actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ - actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ - actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ - actor_rollout_ref.rollout.enable_chunked_prefill=True \ - actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ - actor_rollout_ref.rollout.temperature=${temperature} \ - actor_rollout_ref.rollout.top_p=${top_p} \ - actor_rollout_ref.rollout.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ - actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ - actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.do_sample=True \ - actor_rollout_ref.rollout.val_kwargs.n=1 \ - actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \ - actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ - 
actor_rollout_ref.rollout.name=${rollout_name} \ - actor_rollout_ref.rollout.mode=${rollout_mode} \ - actor_rollout_ref.rollout.calculate_log_probs=True \ - reward_model.reward_manager=dapo \ - +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ - +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ - trainer.logger=['console','tensorboard'] \ - trainer.project_name="${project_name}" \ - trainer.experiment_name="${exp_name}" \ - trainer.val_before_train=True \ - trainer.save_freq=-1 \ - trainer.default_local_dir="${CKPTS_DIR}" \ - trainer.resume_mode=auto \ - trainer.nnodes="${NNODES_TRAIN}" \ - trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - rollout.nnodes="${NNODES_ROLLOUT}" \ - rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ - rollout.total_rollout_steps="${total_rollout_steps}" \ - rollout.total_epochs=10 \ - rollout.test_freq="${test_freq}" \ - async_training.staleness_threshold="${staleness_threshold}" \ - async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ - async_training.partial_rollout="${partial_rollout}" diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/runtime_env.yaml deleted file mode 100644 index 93ae17ebb6f..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_64/fsdp2_fully-async_40-24/runtime_env.yaml +++ /dev/null @@ -1,5 +0,0 @@ -env_vars: - VLLM_USE_V1: "1" - TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-64/dapo_qwen2-7B-math_28k_fsdp2_fully-async_40-24_mbs32_tpf11" - NCCL_DEBUG: "INFO" - HYDRA_FULL_ERROR: "1" \ No newline at end of file diff 
--git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/run.sh b/recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/run.sh deleted file mode 100644 index f98aeb86b57..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/run.sh +++ /dev/null @@ -1,135 +0,0 @@ -#!/usr/bin/env bash -set -xeuo pipefail - -project_name='DAPO' -exp_name='dapo_qwen2-7B-math_28k_megatron_colocate_64_mbs32' - -adv_estimator=grpo - -use_kl_in_reward=False -kl_coef=0.0 -use_kl_loss=False -kl_loss_coef=0.0 - -clip_ratio_low=0.2 -clip_ratio_high=0.28 - -max_prompt_length=$((1024 * 2)) -max_response_length=$((1024 * 28)) -enable_overlong_buffer=True -overlong_buffer_len=$((1024 * 4)) -overlong_penalty_factor=1.0 - -loss_agg_mode="token-mean" - -train_prompt_bsz=512 -n_resp_per_prompt=16 -train_prompt_mini_bsz=32 - -# Ray -# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} -# WORKING_DIR=${WORKING_DIR:-"${PWD}"} -# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} -NNODES=${NNODES:-8} -NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} -# Paths -MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B -CKPTS_DIR=./ckpts/${project_name}/${exp_name} -TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet -TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet - -# Algorithm -temperature=1.0 -top_p=1.0 -top_k=-1 # 0 for HF rollout, -1 for vLLM rollout -val_top_p=0.7 - -# Performance Related Parameter -use_dynamic_bsz=True -actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) -infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) -offload=True -gen_tp=4 -train_tp=4 -train_pp=2 - -# TODO: support dynamic_bsz for megatron -# actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ -# 
actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ -# actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ -# actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ -# actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ -# actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - -python3 -m verl.trainer.main_ppo \ - --config-path=config \ - --config-name='ppo_megatron_trainer.yaml' \ - data.train_files="${TRAIN_FILE}" \ - data.val_files="${TEST_FILE}" \ - data.prompt_key=prompt \ - data.truncation='left' \ - data.max_prompt_length=${max_prompt_length} \ - data.max_response_length=${max_response_length} \ - data.train_batch_size=${train_prompt_bsz} \ - actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ - algorithm.adv_estimator=${adv_estimator} \ - algorithm.use_kl_in_reward=${use_kl_in_reward} \ - algorithm.kl_ctrl.kl_coef=${kl_coef} \ - actor_rollout_ref.actor.strategy=megatron \ - critic.strategy=megatron \ - actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ - actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ - actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ - actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ - actor_rollout_ref.actor.clip_ratio_c=10.0 \ - +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \ - actor_rollout_ref.model.path="${MODEL_PATH}" \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ - actor_rollout_ref.actor.optim.weight_decay=0.1 \ - actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ - actor_rollout_ref.actor.megatron.param_offload=${offload} \ - actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \ - 
actor_rollout_ref.actor.megatron.grad_offload=${offload} \ - actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \ - actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \ - actor_rollout_ref.actor.entropy_coeff=0 \ - actor_rollout_ref.actor.optim.clip_grad=1.0 \ - actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ - actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ - actor_rollout_ref.rollout.enable_chunked_prefill=True \ - actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ - actor_rollout_ref.rollout.temperature=${temperature} \ - actor_rollout_ref.rollout.top_p=${top_p} \ - actor_rollout_ref.rollout.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ - actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ - actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.do_sample=True \ - actor_rollout_ref.rollout.val_kwargs.n=1 \ - actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \ - actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \ - actor_rollout_ref.ref.megatron.param_offload=${offload} \ - reward_model.reward_manager=dapo \ - +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ - +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ - trainer.logger=['console','tensorboard'] \ - trainer.project_name="${project_name}" \ - trainer.experiment_name="${exp_name}" \ - trainer.n_gpus_per_node=8 \ - trainer.nnodes="${NNODES}" \ - trainer.val_before_train=True \ - trainer.test_freq=20 \ - trainer.save_freq=-1 \ - 
trainer.total_epochs=10 \ - trainer.total_training_steps=400 \ - trainer.default_local_dir="${CKPTS_DIR}" \ - trainer.resume_mode=auto \ - trainer.log_val_generations=10 diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/runtime_env.yaml deleted file mode 100644 index a8cd045e180..00000000000 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_64/megatron_colocate/runtime_env.yaml +++ /dev/null @@ -1,3 +0,0 @@ -env_vars: - TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen2-7B-math-64/dapo_qwen2-7B-math_28k_megatron_colocate_64_mbs32" - HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_128/fsdp2_colocate/run.sh b/recipe/fully_async_policy/exp/qwen3-30BA3B_128/fsdp2_colocate/run.sh deleted file mode 100644 index 591ac8533ee..00000000000 --- a/recipe/fully_async_policy/exp/qwen3-30BA3B_128/fsdp2_colocate/run.sh +++ /dev/null @@ -1,125 +0,0 @@ -#!/usr/bin/env bash -set -xeuo pipefail - -project_name='DAPO' -exp_name='dapo_qwen3-30BA3B-math_32k_fsdp2_colocate_128_mbs32' - -adv_estimator=grpo - -use_kl_in_reward=False -kl_coef=0.0 -use_kl_loss=False -kl_loss_coef=0.0 - -clip_ratio_low=0.2 -clip_ratio_high=0.28 - -max_prompt_length=$((1024 * 2)) -max_response_length=$((1024 * 32)) -enable_overlong_buffer=True -overlong_buffer_len=$((1024 * 4)) -overlong_penalty_factor=1.0 - -loss_agg_mode="token-mean" - -train_prompt_bsz=512 -n_resp_per_prompt=16 -train_prompt_mini_bsz=32 - -# Ray -# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} -# WORKING_DIR=${WORKING_DIR:-"${PWD}"} -# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} -NNODES=${NNODES:-16} -NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} -# Paths -MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/basemodel/Qwen/Qwen3-30B-A3B 
-CKPTS_DIR=./ckpts/${project_name}/${exp_name} -TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet -TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet - -# Algorithm -temperature=1.0 -top_p=1.0 -top_k=-1 # 0 for HF rollout, -1 for vLLM rollout -val_top_p=0.7 - -# Performance Related Parameter -sp_size=4 -use_dynamic_bsz=True -actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) -infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) -offload=True -gen_tp=4 -fsdp_size=32 - -python3 -m verl.trainer.main_ppo \ - data.train_files="${TRAIN_FILE}" \ - data.val_files="${TEST_FILE}" \ - data.prompt_key=prompt \ - data.truncation='left' \ - data.max_prompt_length=${max_prompt_length} \ - data.max_response_length=${max_response_length} \ - data.train_batch_size=${train_prompt_bsz} \ - actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ - algorithm.adv_estimator=${adv_estimator} \ - algorithm.use_kl_in_reward=${use_kl_in_reward} \ - algorithm.kl_ctrl.kl_coef=${kl_coef} \ - actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ - actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ - actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ - actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ - actor_rollout_ref.actor.clip_ratio_c=10.0 \ - actor_rollout_ref.model.use_remove_padding=True \ - actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ - actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.model.path="${MODEL_PATH}" \ - 
actor_rollout_ref.model.enable_gradient_checkpointing=True \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ - actor_rollout_ref.actor.optim.weight_decay=0.1 \ - actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ - actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \ - actor_rollout_ref.actor.entropy_coeff=0 \ - actor_rollout_ref.actor.grad_clip=1.0 \ - actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ - actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ - actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ - actor_rollout_ref.rollout.enable_chunked_prefill=True \ - actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ - actor_rollout_ref.rollout.temperature=${temperature} \ - actor_rollout_ref.rollout.top_p=${top_p} \ - actor_rollout_ref.rollout.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ - actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ - actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.do_sample=True \ - actor_rollout_ref.rollout.val_kwargs.n=1 \ - actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \ - actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ - reward_model.reward_manager=dapo \ - +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ - +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ - 
trainer.logger='["console","tensorboard"]' \ - trainer.project_name="${project_name}" \ - trainer.experiment_name="${exp_name}" \ - trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - trainer.nnodes="${NNODES}" \ - trainer.val_before_train=True \ - trainer.test_freq=20 \ - trainer.save_freq=-1 \ - trainer.total_epochs=10 \ - trainer.total_training_steps=400 \ - trainer.default_local_dir="${CKPTS_DIR}" \ - trainer.resume_mode=auto \ - trainer.log_val_generations=10 diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_128/fsdp2_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen3-30BA3B_128/fsdp2_colocate/runtime_env.yaml deleted file mode 100644 index 069b1f14aa0..00000000000 --- a/recipe/fully_async_policy/exp/qwen3-30BA3B_128/fsdp2_colocate/runtime_env.yaml +++ /dev/null @@ -1,3 +0,0 @@ -env_vars: - TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/dapo_qwen3-30BA3B/dapo_qwen3-30BA3B-math_32k_fsdp2_colocate_128_mbs32" - HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/run.sh b/recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/run.sh deleted file mode 100644 index c666034ffc3..00000000000 --- a/recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/run.sh +++ /dev/null @@ -1,161 +0,0 @@ -#!/usr/bin/env bash -set -xeuo pipefail - -project_name='DAPO' -exp_name='dapo_qwen3-30BA3B_32k_megatron_colocate_128_mbs32' - -adv_estimator=grpo - -use_kl_in_reward=False -kl_coef=0.0 -use_kl_loss=False -kl_loss_coef=0.0 - -clip_ratio_low=0.2 -clip_ratio_high=0.28 - -max_prompt_length=$((1024 * 2)) -max_response_length=$((1024 * 32)) -enable_overlong_buffer=True -overlong_buffer_len=$((1024 * 4)) -overlong_penalty_factor=1.0 - -loss_agg_mode="token-mean" - -train_prompt_bsz=512 -train_prompt_mini_bsz=32 -n_resp_per_prompt=16 - -NNODES=${NNODES:-16} -NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} -# Paths 
-RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} -MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/basemodel/Qwen/Qwen3-30B-A3B -CKPTS_DIR=./ckpts/${project_name}/${exp_name} -TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet -TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet - -# Algorithm -temperature=1.0 -top_p=1.0 -top_k=-1 # 0 for HF rollout, -1 for vLLM rollout -val_top_p=0.7 - - -# Performance Related Parameter -use_dynamic_bsz=True -actor_ppo_max_token_len=$((max_prompt_length + max_response_length)) -infer_ppo_max_token_len=$((max_prompt_length + max_response_length)) -offload=True -gen_tp=4 -train_tp=1 -train_pp=1 -EP=8 -ETP=1 -CP=1 - -python3 -m verl.trainer.main_ppo \ - --config-path=config \ - --config-name='ppo_megatron_trainer.yaml' \ - data.train_files="${TRAIN_FILE}" \ - data.val_files="${TEST_FILE}" \ - data.prompt_key=prompt \ - data.truncation='left' \ - data.max_prompt_length=${max_prompt_length} \ - data.max_response_length=${max_response_length} \ - data.train_batch_size=${train_prompt_bsz} \ - actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ - algorithm.adv_estimator=${adv_estimator} \ - algorithm.use_kl_in_reward=${use_kl_in_reward} \ - algorithm.kl_ctrl.kl_coef=${kl_coef} \ - actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ - actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ - actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ - actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ - actor_rollout_ref.actor.clip_ratio_c=10.0 \ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \ - actor_rollout_ref.model.path="${MODEL_PATH}" \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ - 
actor_rollout_ref.actor.optim.weight_decay=0.1 \ - actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ - actor_rollout_ref.actor.entropy_coeff=0 \ - actor_rollout_ref.actor.optim.clip_grad=1.0 \ - actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ - actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ - actor_rollout_ref.rollout.enable_chunked_prefill=True \ - actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ - actor_rollout_ref.rollout.temperature=${temperature} \ - actor_rollout_ref.rollout.top_p=${top_p} \ - actor_rollout_ref.rollout.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ - actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ - actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.do_sample=True \ - actor_rollout_ref.rollout.val_kwargs.n=1 \ - reward_model.reward_manager=dapo \ - +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ - +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ - trainer.logger='["console","tensorboard"]' \ - trainer.project_name="${project_name}" \ - trainer.experiment_name="${exp_name}" \ - trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - trainer.nnodes="${NNODES}" \ - trainer.val_before_train=True \ - trainer.test_freq=20 \ - trainer.save_freq=-1 \ - trainer.total_epochs=10 \ - trainer.total_training_steps=400 \ - trainer.default_local_dir="${CKPTS_DIR}" \ - trainer.resume_mode=auto \ - trainer.log_val_generations=10 \ - critic.strategy=megatron \ - actor_rollout_ref.actor.strategy=megatron \ - 
actor_rollout_ref.actor.megatron.param_offload=${offload} \ - actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \ - actor_rollout_ref.actor.megatron.grad_offload=${offload} \ - actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \ - actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \ - actor_rollout_ref.actor.megatron.expert_model_parallel_size=${EP} \ - actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${ETP} \ - actor_rollout_ref.actor.megatron.context_parallel_size=${CP} \ - +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=True \ - +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \ - +actor_rollout_ref.actor.megatron.override_transformer_config.moe_shared_expert_overlap=False \ - +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=True \ - +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=flex \ - +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \ - +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=selective \ - +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","moe_act","layernorm","mlp","moe"] \ - +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \ - +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \ - +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \ - actor_rollout_ref.ref.megatron.param_offload=${offload} \ - actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \ - actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \ - actor_rollout_ref.ref.megatron.expert_model_parallel_size=${EP} \ - actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${ETP} \ - 
actor_rollout_ref.ref.megatron.context_parallel_size=${CP} \ - actor_rollout_ref.actor.megatron.use_mbridge=True - - # +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=False \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_shared_expert_overlap=False \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=False \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=alltoall \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \ - # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity="selective" \ - # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","moe_act","layernorm","mlp","moe"] \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage=${last_layer} \ \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/runtime_env.yaml deleted file mode 100644 index 4a714f40f43..00000000000 --- a/recipe/fully_async_policy/exp/qwen3-30BA3B_128/megatron_colocate/runtime_env.yaml +++ 
/dev/null @@ -1,5 +0,0 @@ -env_vars: - TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen3-30BA3B-128/dapo_qwen3-30BA3B_32k_megatron_colocate_128_mbs32" - HYDRA_FULL_ERROR: "1" - TORCH_NCCL_AVOID_RECORD_STREAMS: "1" - CUDA_DEVICE_MAX_CONNECTIONS: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen3-32B_128/fsdp2_colocate/run.sh b/recipe/fully_async_policy/exp/qwen3-32B_128/fsdp2_colocate/run.sh deleted file mode 100644 index 8f2e636c59f..00000000000 --- a/recipe/fully_async_policy/exp/qwen3-32B_128/fsdp2_colocate/run.sh +++ /dev/null @@ -1,125 +0,0 @@ -#!/usr/bin/env bash -set -xeuo pipefail - -project_name='DAPO' -exp_name='dapo_qwen3-32B-math_32k_fsdp2_colocate_128_mbs32' - -adv_estimator=grpo - -use_kl_in_reward=False -kl_coef=0.0 -use_kl_loss=False -kl_loss_coef=0.0 - -clip_ratio_low=0.2 -clip_ratio_high=0.28 - -max_prompt_length=$((1024 * 2)) -max_response_length=$((1024 * 32)) -enable_overlong_buffer=True -overlong_buffer_len=$((1024 * 4)) -overlong_penalty_factor=1.0 - -loss_agg_mode="token-mean" - -train_prompt_bsz=512 -n_resp_per_prompt=16 -train_prompt_mini_bsz=32 - -# Ray -# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} -# WORKING_DIR=${WORKING_DIR:-"${PWD}"} -# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} -NNODES=${NNODES:-16} -NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} -# Paths -MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/basemodel/Qwen/Qwen3-32B -CKPTS_DIR=./ckpts/${project_name}/${exp_name} -TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet -TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet - -# Algorithm -temperature=1.0 -top_p=1.0 -top_k=-1 # 0 for HF rollout, -1 for vLLM rollout -val_top_p=0.7 - -# Performance Related Parameter -sp_size=4 -use_dynamic_bsz=True 
-actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) -infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) -offload=True -gen_tp=4 -fsdp_size=32 - -python3 -m verl.trainer.main_ppo \ - data.train_files="${TRAIN_FILE}" \ - data.val_files="${TEST_FILE}" \ - data.prompt_key=prompt \ - data.truncation='left' \ - data.max_prompt_length=${max_prompt_length} \ - data.max_response_length=${max_response_length} \ - data.train_batch_size=${train_prompt_bsz} \ - actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ - algorithm.adv_estimator=${adv_estimator} \ - algorithm.use_kl_in_reward=${use_kl_in_reward} \ - algorithm.kl_ctrl.kl_coef=${kl_coef} \ - actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ - actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ - actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ - actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ - actor_rollout_ref.actor.clip_ratio_c=10.0 \ - actor_rollout_ref.model.use_remove_padding=True \ - actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ - actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.model.path="${MODEL_PATH}" \ - actor_rollout_ref.model.enable_gradient_checkpointing=True \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ - actor_rollout_ref.actor.optim.weight_decay=0.1 \ - actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ - actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \ - actor_rollout_ref.actor.entropy_coeff=0 \ - 
actor_rollout_ref.actor.grad_clip=1.0 \ - actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ - actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ - actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ - actor_rollout_ref.rollout.enable_chunked_prefill=True \ - actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ - actor_rollout_ref.rollout.temperature=${temperature} \ - actor_rollout_ref.rollout.top_p=${top_p} \ - actor_rollout_ref.rollout.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ - actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ - actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.do_sample=True \ - actor_rollout_ref.rollout.val_kwargs.n=1 \ - actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \ - actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ - reward_model.reward_manager=dapo \ - +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ - +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ - trainer.logger='["console","tensorboard"]' \ - trainer.project_name="${project_name}" \ - trainer.experiment_name="${exp_name}" \ - trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - trainer.nnodes="${NNODES}" \ - trainer.val_before_train=True \ - trainer.test_freq=20 \ - trainer.save_freq=-1 \ - trainer.total_epochs=10 \ - trainer.total_training_steps=400 \ - trainer.default_local_dir="${CKPTS_DIR}" \ - trainer.resume_mode=auto \ - trainer.log_val_generations=10 diff --git 
a/recipe/fully_async_policy/exp/qwen3-32B_128/fsdp2_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen3-32B_128/fsdp2_colocate/runtime_env.yaml deleted file mode 100644 index 1b4a8ff4b82..00000000000 --- a/recipe/fully_async_policy/exp/qwen3-32B_128/fsdp2_colocate/runtime_env.yaml +++ /dev/null @@ -1,3 +0,0 @@ -env_vars: - TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/dapo_qwen3-32B/dapo_qwen3-32B-math_32k_fsdp2_colocate_128_mbs32" - HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/run.sh b/recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/run.sh deleted file mode 100644 index a7535e3575d..00000000000 --- a/recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/run.sh +++ /dev/null @@ -1,156 +0,0 @@ -#!/usr/bin/env bash -set -xeuo pipefail - -project_name='DAPO' -exp_name='dapo_qwen3-32B_32k_megatron_colocate_128_mbs32' - -adv_estimator=grpo - -use_kl_in_reward=False -kl_coef=0.0 -use_kl_loss=False -kl_loss_coef=0.0 - -clip_ratio_low=0.2 -clip_ratio_high=0.28 - -max_prompt_length=$((1024 * 2)) -max_response_length=$((1024 * 32)) -enable_overlong_buffer=True -overlong_buffer_len=$((1024 * 4)) -overlong_penalty_factor=1.0 - -loss_agg_mode="token-mean" - -train_prompt_bsz=512 -train_prompt_mini_bsz=32 -n_resp_per_prompt=16 - -NNODES=${NNODES:-16} -NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} -# Paths -RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} -MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/.friday/models/basemodel/Qwen/Qwen3-32B -CKPTS_DIR=./ckpts/${project_name}/${exp_name} -TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet -TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet - -# Algorithm -temperature=1.0 -top_p=1.0 -top_k=-1 # 0 for HF rollout, -1 for vLLM 
rollout -val_top_p=0.7 - - -# Performance Related Parameter -use_dynamic_bsz=True -actor_ppo_max_token_len=$((max_prompt_length + max_response_length)) -infer_ppo_max_token_len=$((max_prompt_length + max_response_length)) -offload=True -gen_tp=4 -train_tp=4 -train_pp=2 -EP=1 -ETP=1 -CP=1 - -python3 -m verl.trainer.main_ppo \ - --config-path=config \ - --config-name='ppo_megatron_trainer.yaml' \ - data.train_files="${TRAIN_FILE}" \ - data.val_files="${TEST_FILE}" \ - data.prompt_key=prompt \ - data.truncation='left' \ - data.max_prompt_length=${max_prompt_length} \ - data.max_response_length=${max_response_length} \ - data.train_batch_size=${train_prompt_bsz} \ - actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ - algorithm.adv_estimator=${adv_estimator} \ - algorithm.use_kl_in_reward=${use_kl_in_reward} \ - algorithm.kl_ctrl.kl_coef=${kl_coef} \ - actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ - actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ - actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ - actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ - actor_rollout_ref.actor.clip_ratio_c=10.0 \ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \ - actor_rollout_ref.model.path="${MODEL_PATH}" \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ - actor_rollout_ref.actor.optim.weight_decay=0.1 \ - actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ - actor_rollout_ref.actor.entropy_coeff=0 \ - actor_rollout_ref.actor.optim.clip_grad=1.0 \ - actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ - actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ - actor_rollout_ref.rollout.enable_chunked_prefill=True \ - 
actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ - actor_rollout_ref.rollout.temperature=${temperature} \ - actor_rollout_ref.rollout.top_p=${top_p} \ - actor_rollout_ref.rollout.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ - actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ - actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.do_sample=True \ - actor_rollout_ref.rollout.val_kwargs.n=1 \ - reward_model.reward_manager=dapo \ - +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ - +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ - trainer.logger='["console","tensorboard"]' \ - trainer.project_name="${project_name}" \ - trainer.experiment_name="${exp_name}" \ - trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - trainer.nnodes="${NNODES}" \ - trainer.val_before_train=True \ - trainer.test_freq=20 \ - trainer.save_freq=-1 \ - trainer.total_epochs=10 \ - trainer.total_training_steps=400 \ - trainer.default_local_dir="${CKPTS_DIR}" \ - trainer.resume_mode=auto \ - trainer.log_val_generations=10 \ - critic.strategy=megatron \ - actor_rollout_ref.actor.strategy=megatron \ - actor_rollout_ref.actor.megatron.param_offload=${offload} \ - actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \ - actor_rollout_ref.actor.megatron.grad_offload=${offload} \ - actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \ - actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \ - actor_rollout_ref.actor.megatron.expert_model_parallel_size=${EP} \ - actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${ETP} \ - 
actor_rollout_ref.actor.megatron.context_parallel_size=${CP} \ - +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=True \ - +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=selective \ - +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","layernorm","mlp"] \ - +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \ - +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \ - +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \ - actor_rollout_ref.ref.megatron.param_offload=${offload} \ - actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \ - actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \ - actor_rollout_ref.ref.megatron.expert_model_parallel_size=${EP} \ - actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${ETP} \ - actor_rollout_ref.ref.megatron.context_parallel_size=${CP} \ - actor_rollout_ref.actor.megatron.use_mbridge=True - - # +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=False \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_shared_expert_overlap=False \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=False \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=alltoall \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \ - # actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity="selective" \ - # 
actor_rollout_ref.actor.megatron.override_transformer_config.recompute_modules=["core_attn","moe_act","layernorm","mlp","moe"] \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \ - # +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage=${last_layer} \ \ No newline at end of file diff --git a/recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/runtime_env.yaml b/recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/runtime_env.yaml deleted file mode 100644 index 1bbc3faadc9..00000000000 --- a/recipe/fully_async_policy/exp/qwen3-32B_128/megatron_colocate/runtime_env.yaml +++ /dev/null @@ -1,5 +0,0 @@ -env_vars: - TENSORBOARD_DIR: "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/expeirments/qwen3-32B-128/dapo_qwen3-32B_32k_megatron_colocate_128_mbs32" - HYDRA_FULL_ERROR: "1" - TORCH_NCCL_AVOID_RECORD_STREAMS: "1" - CUDA_DEVICE_MAX_CONNECTIONS: "1" \ No newline at end of file diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py index b98b3f426e0..2f4ab8ccc6b 100644 --- a/recipe/fully_async_policy/fully_async_main.py +++ b/recipe/fully_async_policy/fully_async_main.py @@ -175,10 +175,6 @@ def _initialize_components(self, config) -> None: print("[ASYNC MAIN] Creating FullyAsyncTrainer...") self._create_trainer(config) - # sync require samples between rollouter and trainer - required_samples = ray.get(self.components["trainer"].get_required_samples.remote()) - 
ray.get(self.components["rollouter"].set_required_samples.remote(required_samples)) - # sync total_train_steps between rollouter and trainer total_train_steps = ray.get(self.components["rollouter"].get_total_train_steps.remote()) print(f"total_train_steps {total_train_steps}") @@ -228,6 +224,8 @@ def _create_rollouter(self, config) -> None: ) ray.get(rollouter.init_workers.remote()) + ray.get(rollouter.set_max_required_samples.remote()) + self.components["rollouter"] = rollouter print("[ASYNC MAIN] Rollouter created and initialized successfully") diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index d10c0684be4..ed6a279ed25 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -115,7 +115,8 @@ def __init__( # Config self.staleness_threshold: float = config.async_training.get("staleness_threshold", 1) - self.required_samples = None + # required_samples use ppo_mini_batch_size as the minimum number of samples. 
+ self.required_samples = config.actor_rollout_ref.actor.ppo_mini_batch_size self.max_required_samples = None # 单次最多扔一次更新需要的样本 self.max_concurrent_samples = None @@ -153,9 +154,8 @@ async def set_message_queue_client(self, message_queue_client: MessageQueueClien async with self.lock: self.message_queue_client = message_queue_client - async def set_required_samples(self, required_samples: int): + async def set_max_required_samples(self): async with self.lock: - self.required_samples = int(required_samples) self.max_required_samples = int( self.required_samples * (self.staleness_threshold + 1) diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index a4c59c33701..b20ca764a7a 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -97,7 +97,7 @@ def __init__( self.progress_bar = None self.trigger_parameter_sync_step = config.async_training.trigger_parameter_sync_step - # calculate required_samples + # required_samples use ppo_mini_batch_size as the minimum number of samples. 
self.required_samples = config.actor_rollout_ref.actor.ppo_mini_batch_size total_gpus = ( config.trainer.nnodes * config.trainer.n_gpus_per_node @@ -121,9 +121,6 @@ def get_actor_wg(self): """Get actor worker group""" return self.actor_wg - def get_required_samples(self): - return self.required_samples - def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]: """ Get samples from message queue and compose gen_batch_output diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64_stal0.1/run.sh b/recipe/fully_async_policy/shell/dapo-32B_fsdp2_64_64.sh similarity index 91% rename from recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64_stal0.1/run.sh rename to recipe/fully_async_policy/shell/dapo-32B_fsdp2_64_64.sh index e9133e50eac..324a7d9470e 100644 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64_stal0.1/run.sh +++ b/recipe/fully_async_policy/shell/dapo-32B_fsdp2_64_64.sh @@ -2,7 +2,7 @@ set -xeuo pipefail project_name='DAPO' -exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tpf4_stal0.1' +exp_name='dapo_qwen2-32B_20k_fsdp2_fully-async_64-64' # Ray # RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} @@ -16,11 +16,8 @@ CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} -# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B CKPTS_DIR=./ckpts/${project_name}/${exp_name} -# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet -# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet 
TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet @@ -44,7 +41,7 @@ clip_ratio_high=0.28 # Response length parameters max_prompt_length=$((1024 * 2)) -max_response_length=$((1024 * 28)) +max_response_length=$((1024 * 20)) enable_overlong_buffer=True overlong_buffer_len=$((1024 * 4)) overlong_penalty_factor=1.0 @@ -65,24 +62,23 @@ infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) ref_offload=True actor_offload=False gen_tp=4 -sp_size=4 -fsdp_size=2 +sp_size=8 +fsdp_size=-1 # Fully async specific parameters NNODES_ROLLOUT=${NNODES_ROLLOUT:-8} NNODES_TRAIN=${NNODES_TRAIN:-8} NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} - train_prompt_bsz=0 gen_prompt_bsz=1 n_resp_per_prompt=16 train_prompt_mini_bsz=32 -total_rollout_steps=$(((512*400))) +total_rollout_steps=$(((512*200))) test_freq=20 -staleness_threshold=0.1 -trigger_parameter_sync_step=4 -partial_rollout=True +staleness_threshold=0 +trigger_parameter_sync_step=16 +partial_rollout=False python -m recipe.fully_async_policy.fully_async_main \ data.train_files="${TRAIN_FILE}" \ diff --git a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh index fc9b2ad6607..f560468a4cf 100644 --- a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh +++ b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh @@ -2,7 +2,7 @@ set -xeuo pipefail project_name='DAPO' -exp_name='DAPO-Qwen2.5-7b-MATH-0527a1-fsdp2-fully-async-8-8' +exp_name='DAPO-Qwen2.5-7b-MATH-0527a1-fsdp2-fully-async-4-12' # Ray # RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} @@ -16,10 +16,10 @@ CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} 
TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} -MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B +MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B CKPTS_DIR=./ckpts/${project_name}/${exp_name} -TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet -TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet +TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet +TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet rollout_mode="async" rollout_name="vllm" # sglang or vllm @@ -75,7 +75,7 @@ n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout)) train_prompt_bsz=0 gen_prompt_bsz=1 n_resp_per_prompt=16 -train_prompt_mini_bsz=64 +train_prompt_mini_bsz=32 total_rollout_steps=$(((512*100))) test_freq=10 staleness_threshold=0.1 diff --git a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_2_6.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_4.sh similarity index 94% rename from recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_2_6.sh rename to recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_4.sh index 10563218878..ef00feb9d05 100644 --- a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_2_6.sh +++ b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_4.sh @@ -16,8 +16,12 @@ CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} -MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B 
+MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B CKPTS_DIR=./ckpts/${project_name}/${exp_name} +TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet +TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet + +MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet @@ -65,19 +69,19 @@ gen_tp=1 sp_size=1 fsdp_size=2 +# Fully async specific parameters NNODES=${NNODES:-1} NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} -# Fully async specific parameters -n_gpus_rollout=2 +n_gpus_rollout=4 n_gpus_training=$((NGPUS_PER_NODE - n_gpus_rollout)) train_prompt_bsz=0 gen_prompt_bsz=1 n_resp_per_prompt=16 -train_prompt_mini_bsz=64 +train_prompt_mini_bsz=32 total_rollout_steps=$(((512*100))) -test_freq=2 +test_freq=10 staleness_threshold=0.1 trigger_parameter_sync_step=16 partial_rollout=True diff --git a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/run.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64.sh similarity index 93% rename from recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/run.sh rename to recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64.sh index 03ebab25cea..1d1958fda79 100644 --- a/recipe/fully_async_policy/exp/qwen2-7B-math_128/fsdp2_fully-async_64-64/run.sh +++ b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64.sh @@ -2,7 +2,7 @@ set -xeuo pipefail project_name='DAPO' -exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64_mbs32_tpf4_fixmcs' +exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_64-64' # Ray # 
RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} @@ -16,11 +16,8 @@ CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} -# MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B CKPTS_DIR=./ckpts/${project_name}/${exp_name} -# TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet -# TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet @@ -73,15 +70,14 @@ NNODES_ROLLOUT=${NNODES_ROLLOUT:-8} NNODES_TRAIN=${NNODES_TRAIN:-8} NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} - train_prompt_bsz=0 gen_prompt_bsz=1 n_resp_per_prompt=16 train_prompt_mini_bsz=32 total_rollout_steps=$(((512*400))) -test_freq=20 +test_freq=10 staleness_threshold=0.1 -trigger_parameter_sync_step=4 +trigger_parameter_sync_step=16 partial_rollout=True python -m recipe.fully_async_policy.fully_async_main \ diff --git a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh index c59877d97f9..85cdaa03fc5 100644 --- a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh +++ b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh @@ -16,10 +16,10 @@ CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} 
-MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B +MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B CKPTS_DIR=./ckpts/${project_name}/${exp_name} -TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet -TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet +TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet +TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet rollout_mode="async" rollout_name="vllm" # sglang or vllm @@ -65,17 +65,19 @@ gen_tp=1 sp_size=1 fsdp_size=2 -NNODES=${NNODES:-1} -NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} # Fully async specific parameters +NNODES_ROLLOUT=${NNODES_ROLLOUT:-1} +NNODES_TRAIN=${NNODES_TRAIN:-1} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} + n_gpus_rollout=8 n_gpus_training=8 train_prompt_bsz=0 gen_prompt_bsz=1 n_resp_per_prompt=16 -train_prompt_mini_bsz=64 +train_prompt_mini_bsz=32 total_rollout_steps=$(((512*100))) test_freq=10 staleness_threshold=0.1 @@ -159,10 +161,10 @@ $PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \ trainer.save_freq=-1 \ trainer.default_local_dir="${CKPTS_DIR}" \ trainer.resume_mode=auto \ - trainer.nnodes="${NNODES}" \ - trainer.n_gpus_per_node="${n_gpus_training}" \ - rollout.nnodes="${NNODES}" \ - rollout.n_gpus_per_node="${n_gpus_rollout}" \ + trainer.nnodes="${NNODES_TRAIN}" \ + trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ + rollout.nnodes="${NNODES_ROLLOUT}" \ + rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ rollout.total_rollout_steps="${total_rollout_steps}" \ rollout.total_epochs=10 \ rollout.test_freq="${test_freq}" \ diff --git a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_colocate.sh 
b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_colocate.sh deleted file mode 100644 index 33f9836e095..00000000000 --- a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_colocate.sh +++ /dev/null @@ -1,141 +0,0 @@ -#!/usr/bin/env bash -set -xeuo pipefail - -project_name='DAPO' -exp_name='DAPO-Qwen2.5-7b-MATH-0527a1-fsdp2-colocate' - -adv_estimator=grpo - -use_kl_in_reward=False -kl_coef=0.0 -use_kl_loss=False -kl_loss_coef=0.0 - -clip_ratio_low=0.2 -clip_ratio_high=0.28 - -max_prompt_length=$((1024 * 2)) -max_response_length=$((1024 * 8)) -enable_overlong_buffer=True -overlong_buffer_len=$((1024 * 4)) -overlong_penalty_factor=1.0 - -loss_agg_mode="token-mean" - -train_prompt_bsz=512 -n_resp_per_prompt=16 -train_prompt_mini_bsz=32 - -# Ray -# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} -# WORKING_DIR=${WORKING_DIR:-"${PWD}"} -# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} -NNODES=${NNODES:-2} -NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} -# Paths -RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} -# very important! 
please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface -MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"} -CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} -TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} -TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} - -MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B -CKPTS_DIR=./ckpts/${project_name}/${exp_name} -TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet -TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet -# Algorithm -temperature=1.0 -top_p=1.0 -top_k=-1 # 0 for HF rollout, -1 for vLLM rollout -val_top_p=0.7 - -# Performance Related Parameter -use_dynamic_bsz=True -actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) -infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) -offload=True -gen_tp=2 -sp_size=4 -fsdp_size=2 - -# reference run wandb: https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/runs/ow47vvon?nw=nwusertongyuxuan361 - -PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python" -if [ ! 
-x "$PYTHON_INTERPRETER" ]; then - PYTHON_INTERPRETER="python3" -fi - -$PYTHON_INTERPRETER -m verl.trainer.main_ppo \ - data.train_files="${TRAIN_FILE}" \ - data.val_files="${TEST_FILE}" \ - data.prompt_key=prompt \ - data.truncation='left' \ - data.max_prompt_length=${max_prompt_length} \ - data.max_response_length=${max_response_length} \ - data.train_batch_size=${train_prompt_bsz} \ - actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ - algorithm.adv_estimator=${adv_estimator} \ - algorithm.use_kl_in_reward=${use_kl_in_reward} \ - algorithm.kl_ctrl.kl_coef=${kl_coef} \ - actor_rollout_ref.actor.strategy=fsdp2 \ - critic.strategy=fsdp2 \ - actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ - actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ - actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ - actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ - actor_rollout_ref.actor.clip_ratio_c=10.0 \ - actor_rollout_ref.model.use_remove_padding=True \ - +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ - actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ - actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.model.path="${MODEL_PATH}" \ - actor_rollout_ref.model.enable_gradient_checkpointing=True \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ - actor_rollout_ref.actor.optim.weight_decay=0.1 \ - actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ - actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \ - 
actor_rollout_ref.actor.entropy_coeff=0 \ - actor_rollout_ref.actor.grad_clip=1.0 \ - actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ - actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ - actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ - actor_rollout_ref.rollout.enable_chunked_prefill=True \ - actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ - actor_rollout_ref.rollout.temperature=${temperature} \ - actor_rollout_ref.rollout.top_p=${top_p} \ - actor_rollout_ref.rollout.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ - actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ - actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.do_sample=True \ - actor_rollout_ref.rollout.val_kwargs.n=1 \ - actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \ - actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ - reward_model.reward_manager=dapo \ - +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ - +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ - trainer.logger=['console','tensorboard'] \ - trainer.project_name="${project_name}" \ - trainer.experiment_name="${exp_name}" \ - trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - trainer.nnodes="${NNODES}" \ - trainer.val_before_train=True \ - trainer.test_freq=10 \ - trainer.save_freq=-1 \ - trainer.total_epochs=10 \ - trainer.total_training_steps=100 \ - trainer.default_local_dir="${CKPTS_DIR}" \ - trainer.resume_mode=auto \ - 
trainer.log_val_generations=10 diff --git a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_server.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_server.sh deleted file mode 100644 index 087dea05121..00000000000 --- a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_server.sh +++ /dev/null @@ -1,148 +0,0 @@ -#!/usr/bin/env bash -set -xeuo pipefail - -project_name='DAPO' -exp_name='DAPO-Qwen2.5-7b-MATH-0527a1-fsdp2-server' - - -rollout_mode="async" -rollout_name="vllm" # sglang or vllm -if [ "$rollout_mode" = "async" ]; then - export VLLM_USE_V1=1 - return_raw_chat="True" -fi - - -adv_estimator=grpo - -use_kl_in_reward=False -kl_coef=0.0 -use_kl_loss=False -kl_loss_coef=0.0 - -clip_ratio_low=0.2 -clip_ratio_high=0.28 - -max_prompt_length=$((1024 * 2)) -max_response_length=$((1024 * 8)) -enable_overlong_buffer=True -overlong_buffer_len=$((1024 * 4)) -overlong_penalty_factor=1.0 - -loss_agg_mode="token-mean" - -train_prompt_bsz=512 -n_resp_per_prompt=16 -train_prompt_mini_bsz=32 - -# Ray -# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} -# WORKING_DIR=${WORKING_DIR:-"${PWD}"} -# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} -NNODES=${NNODES:-1} -NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} -# Paths -RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} -# very important! 
please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface -MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"} -CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} -TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} -TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} - -MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B -CKPTS_DIR=./ckpts/${project_name}/${exp_name} -TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet -TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet -# Algorithm -temperature=1.0 -top_p=1.0 -top_k=-1 # 0 for HF rollout, -1 for vLLM rollout -val_top_p=0.7 - -# Performance Related Parameter -use_dynamic_bsz=True -actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) -infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) -offload=True -gen_tp=1 -sp_size=1 -fsdp_size=2 - -# reference run wandb: https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/runs/ow47vvon?nw=nwusertongyuxuan361 - -/home/hadoop-djst-algoplat/miniconda3/bin/python -m verl.trainer.main_ppo \ - data.train_files="${TRAIN_FILE}" \ - data.val_files="${TEST_FILE}" \ - data.prompt_key=prompt \ - data.truncation='left' \ - data.max_prompt_length=${max_prompt_length} \ - data.max_response_length=${max_response_length} \ - data.train_batch_size=${train_prompt_bsz} \ - data.return_raw_chat=${return_raw_chat} \ - actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ - algorithm.adv_estimator=${adv_estimator} \ - algorithm.use_kl_in_reward=${use_kl_in_reward} \ - algorithm.kl_ctrl.kl_coef=${kl_coef} \ - actor_rollout_ref.actor.strategy=fsdp2 \ - critic.strategy=fsdp2 \ - actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ - 
actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ - actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ - actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ - actor_rollout_ref.actor.clip_ratio_c=10.0 \ - actor_rollout_ref.model.use_remove_padding=True \ - +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ - actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ - actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.model.path="${MODEL_PATH}" \ - actor_rollout_ref.model.enable_gradient_checkpointing=True \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ - actor_rollout_ref.actor.optim.weight_decay=0.1 \ - actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ - actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \ - actor_rollout_ref.actor.entropy_coeff=0 \ - actor_rollout_ref.actor.grad_clip=1.0 \ - actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ - actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ - actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ - actor_rollout_ref.rollout.enable_chunked_prefill=True \ - actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ - actor_rollout_ref.rollout.temperature=${temperature} \ - actor_rollout_ref.rollout.top_p=${top_p} \ - actor_rollout_ref.rollout.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ - 
actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ - actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.do_sample=True \ - actor_rollout_ref.rollout.val_kwargs.n=1 \ - actor_rollout_ref.rollout.name=${rollout_name} \ - actor_rollout_ref.rollout.mode=${rollout_mode} \ - actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \ - actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ - reward_model.reward_manager=dapo \ - +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ - +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ - trainer.logger=['console','tensorboard'] \ - trainer.project_name="${project_name}" \ - trainer.experiment_name="${exp_name}" \ - trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - trainer.nnodes="${NNODES}" \ - trainer.val_before_train=True \ - trainer.test_freq=-1 \ - trainer.save_freq=-1 \ - trainer.total_epochs=10 \ - trainer.total_training_steps=10 \ - trainer.default_local_dir="${CKPTS_DIR}" \ - trainer.resume_mode=auto \ - trainer.log_val_generations=10 diff --git a/recipe/fully_async_policy/shell/dapo_7b_math_megatron_colocate.sh b/recipe/fully_async_policy/shell/dapo_7b_math_megatron_colocate.sh deleted file mode 100644 index d05f5571876..00000000000 --- a/recipe/fully_async_policy/shell/dapo_7b_math_megatron_colocate.sh +++ /dev/null @@ -1,142 +0,0 @@ -#!/usr/bin/env bash -set -xeuo pipefail - -project_name='DAPO' -exp_name='DAPO-Qwen2.5-7b-MATH-0519a1-megatron-colocate' - -adv_estimator=grpo - -use_kl_in_reward=False -kl_coef=0.0 -use_kl_loss=False -kl_loss_coef=0.0 - -clip_ratio_low=0.2 -clip_ratio_high=0.28 - -max_prompt_length=$((1024 
* 2)) -max_response_length=$((1024 * 8)) -enable_overlong_buffer=True -overlong_buffer_len=$((1024 * 4)) -overlong_penalty_factor=1.0 - -loss_agg_mode="token-mean" - -train_prompt_bsz=512 -n_resp_per_prompt=16 -train_prompt_mini_bsz=32 - -# Ray -# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} -# WORKING_DIR=${WORKING_DIR:-"${PWD}"} -# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} -NNODES=${NNODES:-2} -NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} -# Paths -RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} -# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface -MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"} -CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} -TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} -TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} - -MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B -CKPTS_DIR=./ckpts/${project_name}/${exp_name} -TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet -TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet - -# Algorithm -temperature=1.0 -top_p=1.0 -top_k=-1 # 0 for HF rollout, -1 for vLLM rollout -val_top_p=0.7 - -# Performance Related Parameter -use_dynamic_bsz=True -actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) -infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) -offload=True -gen_tp=2 -train_tp=2 -train_pp=2 - -# TODO: support dynamic_bsz for megatron -# actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ -# actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ -# actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ -# 
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ -# actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ -# actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - -python3 -m verl.trainer.main_ppo \ - --config-path=config \ - --config-name='ppo_megatron_trainer.yaml' \ - data.train_files="${TRAIN_FILE}" \ - data.val_files="${TEST_FILE}" \ - data.prompt_key=prompt \ - data.truncation='left' \ - data.max_prompt_length=${max_prompt_length} \ - data.max_response_length=${max_response_length} \ - data.train_batch_size=${train_prompt_bsz} \ - actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ - algorithm.adv_estimator=${adv_estimator} \ - algorithm.use_kl_in_reward=${use_kl_in_reward} \ - algorithm.kl_ctrl.kl_coef=${kl_coef} \ - actor_rollout_ref.actor.strategy=megatron \ - critic.strategy=megatron \ - actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ - actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ - actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ - actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ - actor_rollout_ref.actor.clip_ratio_c=10.0 \ - +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \ - actor_rollout_ref.model.path="${MODEL_PATH}" \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ - actor_rollout_ref.actor.optim.weight_decay=0.1 \ - actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ - actor_rollout_ref.actor.megatron.param_offload=${offload} \ - actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \ - actor_rollout_ref.actor.megatron.grad_offload=${offload} \ - actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \ - 
actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \ - actor_rollout_ref.actor.entropy_coeff=0 \ - actor_rollout_ref.actor.optim.clip_grad=1.0 \ - actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ - actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ - actor_rollout_ref.rollout.enable_chunked_prefill=True \ - actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ - actor_rollout_ref.rollout.temperature=${temperature} \ - actor_rollout_ref.rollout.top_p=${top_p} \ - actor_rollout_ref.rollout.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ - actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ - actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ - actor_rollout_ref.rollout.val_kwargs.do_sample=True \ - actor_rollout_ref.rollout.val_kwargs.n=1 \ - actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \ - actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \ - actor_rollout_ref.ref.megatron.param_offload=${offload} \ - reward_model.reward_manager=dapo \ - +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ - +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ - +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ - trainer.logger=['console','tensorboard'] \ - trainer.project_name="${project_name}" \ - trainer.experiment_name="${exp_name}" \ - trainer.n_gpus_per_node=8 \ - trainer.nnodes="${NNODES}" \ - trainer.val_before_train=True \ - trainer.test_freq=10 \ - trainer.save_freq=-1 \ - trainer.total_epochs=10 \ - trainer.total_training_steps=100 \ - trainer.default_local_dir="${CKPTS_DIR}" \ - trainer.resume_mode=auto \ - 
trainer.log_val_generations=10 diff --git a/recipe/fully_async_policy/shell/runtime_env.yaml b/recipe/fully_async_policy/shell/runtime_env.yaml index dcca08e67f7..88467b8c243 100644 --- a/recipe/fully_async_policy/shell/runtime_env.yaml +++ b/recipe/fully_async_policy/shell/runtime_env.yaml @@ -1,5 +1,4 @@ env_vars: VLLM_USE_V1: "1" - TENSORBOARD_DIR: "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/tensorboard/dapo_7b_math_fsdp2_4_12" NCCL_DEBUG: "INFO" HYDRA_FULL_ERROR: "1" \ No newline at end of file diff --git a/recipe/one_step_off_policy/fsdp_workers.py b/recipe/one_step_off_policy/fsdp_workers.py index 350c105087e..5bb96e2c0b3 100644 --- a/recipe/one_step_off_policy/fsdp_workers.py +++ b/recipe/one_step_off_policy/fsdp_workers.py @@ -281,7 +281,6 @@ def async_generate_sequences(self, prompts): output = output.to("cpu") # clear kv cache - get_torch_device().empty_cache() return output @register(dispatch_mode=Dispatch.ONE_TO_ALL) diff --git a/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py b/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py index baef0c9315e..456329f59ea 100644 --- a/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py +++ b/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py @@ -532,7 +532,6 @@ async def _loop_forever(self): def _init_worker(self, all_kwargs: list[dict[str, Any]]): """Initialize worker engine.""" - all_kwargs[0]["rank"] = int(os.environ["RANK"]) device_name = "NPU" if is_npu_available else "GPU" all_kwargs[0]["local_rank"] = ( From 5adde906bea75ac59f775704dd9b3bfd5cd0705f Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Thu, 18 Sep 2025 22:13:05 +0800 Subject: [PATCH 157/182] update shel --- recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh | 7 +------ recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_4.sh | 7 +------ recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh | 7 +------ 3 files changed, 3 insertions(+), 18 deletions(-) diff --git 
a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh index f560468a4cf..dbfbee8fdfc 100644 --- a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh +++ b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh @@ -82,12 +82,7 @@ staleness_threshold=0.1 trigger_parameter_sync_step=16 partial_rollout=True -PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python" -if [ ! -x "$PYTHON_INTERPRETER" ]; then - PYTHON_INTERPRETER="python3" -fi - -$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \ +python -m recipe.fully_async_policy.fully_async_main \ data.train_files="${TRAIN_FILE}" \ data.val_files="${TEST_FILE}" \ data.prompt_key=prompt \ diff --git a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_4.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_4.sh index ef00feb9d05..6f64caaea0a 100644 --- a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_4.sh +++ b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_4.sh @@ -86,12 +86,7 @@ staleness_threshold=0.1 trigger_parameter_sync_step=16 partial_rollout=True -PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python" -if [ ! -x "$PYTHON_INTERPRETER" ]; then - PYTHON_INTERPRETER="python3" -fi - -$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \ +python -m recipe.fully_async_policy.fully_async_main \ data.train_files="${TRAIN_FILE}" \ data.val_files="${TEST_FILE}" \ data.prompt_key=prompt \ diff --git a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh index 85cdaa03fc5..02f7664360f 100644 --- a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh +++ b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh @@ -84,12 +84,7 @@ staleness_threshold=0.1 trigger_parameter_sync_step=16 partial_rollout=True -PYTHON_INTERPRETER="/home/hadoop-djst-algoplat/miniconda3/bin/python" -if [ ! 
-x "$PYTHON_INTERPRETER" ]; then - PYTHON_INTERPRETER="python3" -fi - -$PYTHON_INTERPRETER -m recipe.fully_async_policy.fully_async_main \ +python -m recipe.fully_async_policy.fully_async_main \ data.train_files="${TRAIN_FILE}" \ data.val_files="${TEST_FILE}" \ data.prompt_key=prompt \ From 3155b444286e358583a2a3d988b44b7aa9dc1362 Mon Sep 17 00:00:00 2001 From: wangshulin02 <953550366@qq.com> Date: Fri, 19 Sep 2025 11:53:13 +0800 Subject: [PATCH 158/182] fix notation --- recipe/fully_async_policy/fully_async_main.py | 2 +- recipe/fully_async_policy/fully_async_rollouter.py | 1 - recipe/fully_async_policy/param_sync.py | 4 ++-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/recipe/fully_async_policy/fully_async_main.py b/recipe/fully_async_policy/fully_async_main.py index 2f4ab8ccc6b..4dafd44844f 100644 --- a/recipe/fully_async_policy/fully_async_main.py +++ b/recipe/fully_async_policy/fully_async_main.py @@ -110,7 +110,7 @@ def create_role_worker_mapping(config): role_worker_mapping[Role.RewardModel] = ray.remote(RewardModelWorker) - # 添加reference policy(如果需要KL loss或reward) + # Add reference policy (if KL loss or reward is required) if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss: role_worker_mapping[Role.RefPolicy] = ray.remote(DetachActorWorker) diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index b002d892e6a..00411b21a8f 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -165,7 +165,6 @@ async def set_max_required_samples(self): / (self.required_samples * self.config.async_training.trigger_parameter_sync_step) ) - # 单次最多扔一次更新需要的样本 self.max_concurrent_samples = len(self.async_rollout_manager.server_handles) * 16 self.max_concurrent_samples = min(self.max_concurrent_samples, self.max_required_samples) self.max_queue_size = self.max_required_samples diff --git 
a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py index 55d11d236c0..b841019837a 100644 --- a/recipe/fully_async_policy/param_sync.py +++ b/recipe/fully_async_policy/param_sync.py @@ -96,10 +96,10 @@ def sync_weights(self, version, validate=False, global_steps=0): self.wait_last_resume = self.rollouter.resume.remote() def wait_last_valid(self): - print("[ParameterSynchronizer] waiting last validate...") + print("[ParameterSynchronizer] Waiting last validate...") start_time = time.time() if self.wait_last_update: ray.get(self.wait_last_update) if self.wait_last_resume: ray.get(self.wait_last_resume) - print(f"[ParameterSynchronizer], cost: {time.time() - start_time:.2f} seconds") + print(f"[ParameterSynchronizer] Wait last validate cost: {time.time() - start_time:.2f} seconds") From c39f283ef4702d8cc57412ed8a2c83d2038d1380 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Fri, 19 Sep 2025 15:56:56 +0800 Subject: [PATCH 159/182] rm print --- recipe/fully_async_policy/fsdp_workers.py | 3 --- recipe/fully_async_policy/fully_async_trainer.py | 5 +++-- verl/experimental/agent_loop/agent_loop.py | 8 ++++++++ 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/recipe/fully_async_policy/fsdp_workers.py b/recipe/fully_async_policy/fsdp_workers.py index 1ee54112c54..ad6b0db8b51 100644 --- a/recipe/fully_async_policy/fsdp_workers.py +++ b/recipe/fully_async_policy/fsdp_workers.py @@ -47,9 +47,6 @@ def get_inference_model(rollout): Returns: model: model object """ - - print(rollout) - print(dir(rollout)) inference_engine = rollout.inference_engine if hasattr(inference_engine, "llm_engine"): inference_model = inference_engine.llm_engine.model_executor.driver_worker.worker.model_runner.model diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index eb2a23867f8..a391a0c9c38 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ 
b/recipe/fully_async_policy/fully_async_trainer.py @@ -137,7 +137,7 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]: # Collect samples using a simple loop calling get_sample consumer_start = time.time() queue_samples = [] - + queue_len = 0 while len(queue_samples) < self.required_samples: # Get a single sample and wait until there is a sample or None is received sample, queue_len = self.message_queue_client.get_sample_sync() @@ -166,7 +166,8 @@ def _get_samples_from_queue(self) -> tuple[None, None] | tuple[int, Any]: print( f"[FullyAsyncTrainer] Loop collection completed: {len(queue_samples)}/{self.required_samples} samples, " - f"total wait time: {total_wait_time:.2f} seconds" + f"total wait time: {total_wait_time:.2f} seconds." + f"mq_len: {queue_len}" ) queue_samples = [ray.cloudpickle.loads(x) for x in queue_samples] diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py index ae56c2a187a..70b4d2e877e 100644 --- a/verl/experimental/agent_loop/agent_loop.py +++ b/verl/experimental/agent_loop/agent_loop.py @@ -691,9 +691,17 @@ def _postprocess(self, inputs: list[_InternalAgentLoopOutput]) -> DataProto: @ray.remote class AgentLoopWorker(AgentLoopWorkerBase): + """Agent loop worker takes a batch of messages and run each message in an agent loop.""" + def __init__( self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], rm_executor: BatchExecutor = None ): + """Initialize agent loop manager. + + Args: + config (DictConfig): YAML config. + server_handles (List[ray.actor.ActorHandle]): OpenAI compatible LLM server actor handles. 
+ """ super().__init__(config, server_handles, rm_executor) From e5116944a6c331b2a8ecb4d6e89a5dc7b78f0d58 Mon Sep 17 00:00:00 2001 From: wangshulin02 <953550366@qq.com> Date: Sun, 28 Sep 2025 17:20:34 +0800 Subject: [PATCH 160/182] fix log prob in hybird&streaming mode --- verl/trainer/config/actor/dp_actor.yaml | 3 +++ verl/workers/actor/dp_actor.py | 2 +- verl/workers/config/actor.py | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/verl/trainer/config/actor/dp_actor.yaml b/verl/trainer/config/actor/dp_actor.yaml index ab27304f736..9969f7635b9 100644 --- a/verl/trainer/config/actor/dp_actor.yaml +++ b/verl/trainer/config/actor/dp_actor.yaml @@ -40,3 +40,6 @@ entropy_checkpointing: False # Whether to remove padding tokens in inputs during training use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + +# Whether it's a hybrid engine +hybrid_engine: ${oc.select:actor_rollout_ref.hybrid_engine, True} \ No newline at end of file diff --git a/verl/workers/actor/dp_actor.py b/verl/workers/actor/dp_actor.py index d26a7244ee8..4c3b0dc9d23 100644 --- a/verl/workers/actor/dp_actor.py +++ b/verl/workers/actor/dp_actor.py @@ -394,7 +394,7 @@ def update_policy(self, data: DataProto): # See PPO paper for details. 
https://arxiv.org/abs/1707.06347 mini_batches = data.split(self.config.ppo_mini_batch_size) - on_policy = len(mini_batches) == 1 and self.config.ppo_epochs == 1 + on_policy = len(mini_batches) == 1 and self.config.ppo_epochs == 1 and self.config.hybrid_engine metrics = {} for _ in range(self.config.ppo_epochs): diff --git a/verl/workers/config/actor.py b/verl/workers/config/actor.py index af6199732b7..db1b5967abb 100644 --- a/verl/workers/config/actor.py +++ b/verl/workers/config/actor.py @@ -232,6 +232,7 @@ class FSDPActorConfig(ActorConfig): fsdp_config: FSDPEngineConfig = field(default_factory=FSDPEngineConfig) use_remove_padding: bool = False profiler: ProfilerConfig = field(default_factory=ProfilerConfig) + hybrid_engine: bool = True def __post_init__(self): """Validate FSDP actor configuration parameters.""" From 41cea0fb81d45005e81820dd96b47bb87a8104cd Mon Sep 17 00:00:00 2001 From: wangshulin02 <953550366@qq.com> Date: Mon, 29 Sep 2025 16:06:57 +0800 Subject: [PATCH 161/182] fix stale_samples_processed and stale_trajectory_processed metrics --- recipe/fully_async_policy/fully_async_trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index a391a0c9c38..8559433b625 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -314,10 +314,10 @@ def _collect_metrics_from_samples(self, batch, metrics): """ if hasattr(batch, "meta_info") and batch.meta_info: samples_param_versions = batch.meta_info["rollout_param_versions"] - stale_count = sum(1 for v in samples_param_versions if self.current_param_version - v > 1) + stale_count = sum(1 for v in samples_param_versions if self.current_param_version - v >= 1) self.stale_samples_processed += stale_count trajectory_param_versions = batch.meta_info["trajectory_param_versions"] - stale_traj_count = sum(1 for v in 
trajectory_param_versions if self.current_param_version - v > 1) + stale_traj_count = sum(1 for v in trajectory_param_versions if self.current_param_version - v >= 1) self.stale_trajectory_processed += stale_traj_count metrics.update( { From 97615b4b3d13de63d317bd58c31142a95a0c24be Mon Sep 17 00:00:00 2001 From: wangshulin02 <953550366@qq.com> Date: Sat, 11 Oct 2025 15:52:50 +0800 Subject: [PATCH 162/182] add require_batches config param --- .../fully_async_policy/config/fully_async_ppo_trainer.yaml | 3 +++ recipe/fully_async_policy/fully_async_rollouter.py | 5 +++-- recipe/fully_async_policy/fully_async_trainer.py | 5 +++-- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml index c2708b975be..84a3cb7c290 100644 --- a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml +++ b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml @@ -15,6 +15,9 @@ async_training: # One step means trainer obtains a batch of required samples trigger_parameter_sync_step: 4 + # The number of ppo_mini_batches that the FullyAsyncTrainer obtains once + require_batches: 1 + # When synchronizing parameters, whether to interrupt rollouter and perform partial rollout partial_rollout: True diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 00411b21a8f..c3ba74b5640 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -115,8 +115,9 @@ def __init__( # Config self.staleness_threshold: float = config.async_training.get("staleness_threshold", 1) - # required_samples use ppo_mini_batch_size as the minimum number of samples. - self.required_samples = config.actor_rollout_ref.actor.ppo_mini_batch_size + # required_samples use ppo_mini_batch_size*require_batches as the minimum number of samples. 
+ self.require_batches = config.async_training.require_batches + self.required_samples = config.actor_rollout_ref.actor.ppo_mini_batch_size * self.require_batches self.max_required_samples = None self.max_concurrent_samples = None # queue size diff --git a/recipe/fully_async_policy/fully_async_trainer.py b/recipe/fully_async_policy/fully_async_trainer.py index 8559433b625..6693eac7406 100644 --- a/recipe/fully_async_policy/fully_async_trainer.py +++ b/recipe/fully_async_policy/fully_async_trainer.py @@ -97,8 +97,9 @@ def __init__( self.progress_bar = None self.trigger_parameter_sync_step = config.async_training.trigger_parameter_sync_step - # required_samples use ppo_mini_batch_size as the minimum number of samples. - self.required_samples = config.actor_rollout_ref.actor.ppo_mini_batch_size + # required_samples use ppo_mini_batch_size*require_batches as the minimum number of samples. + self.require_batches = config.async_training.require_batches + self.required_samples = config.actor_rollout_ref.actor.ppo_mini_batch_size * self.require_batches total_gpus = ( config.trainer.nnodes * config.trainer.n_gpus_per_node + config.rollout.nnodes * config.rollout.n_gpus_per_node From 211a441f3a3b124409a4462785ddfe6289aec6be Mon Sep 17 00:00:00 2001 From: wangshulin02 <953550366@qq.com> Date: Mon, 13 Oct 2025 20:54:33 +0800 Subject: [PATCH 163/182] fix staleness_samples reset bug --- .../fully_async_rollouter.py | 28 +++++++++++-------- recipe/fully_async_policy/param_sync.py | 4 +-- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index c3ba74b5640..4a81051a686 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -16,6 +16,7 @@ from pprint import pformat import ray +from ray import ObjectRef from recipe.fully_async_policy.detach_utils import ( RolloutSample, @@ -333,16 +334,7 @@ async def 
_processor_worker(self): """ Streaming worker coroutines, a sample is submitted for processing without waiting for batches """ - while True: - simple_from_cancel_queue = False - if not self.cancel_queue.empty(): - rollout_sample = await self.cancel_queue.get() - simple_from_cancel_queue = True - else: - rollout_sample = await self.pending_queue.get() - self.staleness_samples += 1 - if self.paused or await self._should_pause_generation(): print( "[FullyAsyncRollouter][Processor] Received pause signal, waiting for remaining tasks to return..." @@ -363,6 +355,15 @@ async def _processor_worker(self): while self.paused: self.idle_start_time = time.time() await self.condition.wait() + continue + + simple_from_cancel_queue = False + if not self.cancel_queue.empty(): + rollout_sample = await self.cancel_queue.get() + simple_from_cancel_queue = True + else: + rollout_sample = await self.pending_queue.get() + self.staleness_samples += 1 if rollout_sample == "DONE": print( @@ -567,6 +568,7 @@ async def _async_monitor_loop(self): async with self.lock: self.paused = False self.condition.notify_all() + print("[FullyAsyncRollouter][MonitorLoop] Trigger rollout recovery in MonitorLoop") async def _should_pause_generation(self) -> bool: """Determine whether the build should be paused""" @@ -581,12 +583,12 @@ async def _should_pause_generation(self) -> bool: ) return True - if self.staleness_samples > self.max_required_samples: + if self.staleness_samples >= self.max_required_samples: if not self.paused: print( "[FullyAsyncRollouter][ShouldPause] " f"due to " - f"staleness_samples {self.staleness_samples} > max_required_samples {self.max_required_samples} " + f"staleness_samples {self.staleness_samples} >= max_required_samples {self.max_required_samples} " ) return True @@ -607,7 +609,9 @@ async def pause(self): await self.async_rollout_manager.reset_prefix_cache() self.monitor_loop_trigger = False - async def resume(self): + async def resume(self, dependency_ref: ObjectRef = None): 
+ if dependency_ref is not None: + ray.get(dependency_ref) print("[FullyAsyncRollouter][Public][Resume]") async with self.lock: self.paused = False diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py index b841019837a..d6c67ceb409 100644 --- a/recipe/fully_async_policy/param_sync.py +++ b/recipe/fully_async_policy/param_sync.py @@ -93,10 +93,10 @@ def sync_weights(self, version, validate=False, global_steps=0): # Async Update rollout version & validation self.wait_last_update = self.rollouter.update_param_version.remote(version, validate, global_steps) - self.wait_last_resume = self.rollouter.resume.remote() + self.wait_last_resume = self.rollouter.resume.remote(self.wait_last_update) def wait_last_valid(self): - print("[ParameterSynchronizer] Waiting last validate...") + print("[ParameterSynchronizer] Waiting last sync and validate...") start_time = time.time() if self.wait_last_update: ray.get(self.wait_last_update) From f7a8e96608cb44eedccb1c02abbcd79bfe996665 Mon Sep 17 00:00:00 2001 From: wangshulin02 <953550366@qq.com> Date: Tue, 14 Oct 2025 10:15:21 +0800 Subject: [PATCH 164/182] del debug code --- recipe/fully_async_policy/fully_async_rollouter.py | 1 - 1 file changed, 1 deletion(-) diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index 4a81051a686..503c6ae6d5f 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -568,7 +568,6 @@ async def _async_monitor_loop(self): async with self.lock: self.paused = False self.condition.notify_all() - print("[FullyAsyncRollouter][MonitorLoop] Trigger rollout recovery in MonitorLoop") async def _should_pause_generation(self) -> bool: """Determine whether the build should be paused""" From 3de3ed04cfc4294cdbc1b19597b0978246b4c12f Mon Sep 17 00:00:00 2001 From: arron Date: Tue, 14 Oct 2025 11:31:54 +0800 Subject: [PATCH 165/182] add README_zh.md 
--- recipe/fully_async_policy/README.md | 66 ------ recipe/fully_async_policy/README_zh.md | 316 +++++++++++++++++++++++++ 2 files changed, 316 insertions(+), 66 deletions(-) delete mode 100644 recipe/fully_async_policy/README.md create mode 100644 recipe/fully_async_policy/README_zh.md diff --git a/recipe/fully_async_policy/README.md b/recipe/fully_async_policy/README.md deleted file mode 100644 index 0509969216b..00000000000 --- a/recipe/fully_async_policy/README.md +++ /dev/null @@ -1,66 +0,0 @@ -# 基于verl的改造方案 - -## 方案 - -### 方案1 (StreamRL, AsyncFlow) - -![StreamRL]( -https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/StreamRL.png?raw=true) - -在分离架构的基础上,修改在Rollout和Train的样本传递过程中,将离线策略生成一批global样本修改为生成一批batch的方式,实现生成和训练两阶段的高度重叠。 -训练阶段一收到足够样本就开始处理,训练一定步数后,将参数同步到PS侧, Rollout在每次样本生成完成后,check是否有新的参数,如果有就进行一次同步。 - -### 方案2 (Mistralai, Areal) - -![mistralai]( -https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/mistralai.png?raw=true) - -在分离架构的基础上,实现Rollout的partial rollout逻辑。样本仍然修改为batch的方式进行传递,实现生成和训练两阶段的高度重叠。 -在参数同步方面,训练阶段主动触发Rollout的暂停,参数同步以及恢复。 Rollout使用Rollout Server的方式,支持样本生成的中断与恢复, -产生的的样本所使用的参数版本会有所不同。 - -### 折中 - -上述两种方案的核心都是将训练与生成进行overlap,核心区别主要集中在参数同步的处理方式不同,方案1需要实现PS完成参数的异步加载。 -方案2使用同步的方式进行参数同步,但需要完成PartialRollout的逻辑。综合已有代码,以及社区进行中的工作,我们希望先将异步的工作流搭建完成,先以方案1进行开发,后续再进一步开发方案2。 - -## 设计 - -### 架构图 - -![full_async]( -https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/full_async.svg?raw=true) - -为实现纯异步训练工作流,基于已有的 one step off policy 代码,扩增实现 Rollouter 以及 Message Queue,以及对Trainer进行更新。 - -整体的训练流程参考StreamRL,将原有流程中生成 train_batch_size 个样本后进行下一步训练的过程,修改为流式的样本传递,train -拿到一次前向的样本后就进行样本分发(ppo_mini_batch_size*worker)。与one-step-off相比,我们将一次step的异步,继续细化到一次batch的异步。 - -**MessageQueue** 作为Ray的Actor存在,支持zeromq消息队列保存生成的样本,并提供给Trainer使用。Trainer 和 Rollouter 都持有 -MessageQueue 的Handler,通过接口完成样本的插入与消费。 - -**FullyAsyncRollouter** 类似于现有的 Trainer,实现fit()工作流,循环调用 Rollout 进行样本的生成。FullyAsyncRollouter 对于已有的 -vLLMAsyncRollout 
SGLangAsyncRollout 进行封装。 - -* 方案1,使用异步更新策略,FullyAsyncRollouter 根据样本生成的进展,自动访问PS,判断是否进行新的参数加载。 -* 方案2,参考PR https://github.com/volcengine/verl/pull/2246 https://github.com/volcengine/verl/pull/2200 Rollout - 组件需要支持暂停及恢复,从而进行参数的更新。暂停时,需要保存进行中的rollout样本,下次继续恢复生产。 - -**FullyAsyncTrainer** 与当前实现类似,区别是样本的获取修改为从Queue中获取,Queue有最少batch样本就开始进行分发。rainer完成一次step的训练后, -与FullyAsyncRollouter的使用策略对应: - -* 方案1,使用异步更新策略,参数产生后,主动同步到PS中。 -* 方案2,直接调用Rollouter进行同步,主动通知Rollouter暂停生成,进行参数的同步更新。 - -## 总结 - -当Rollouter生产快于Trainer消费时,queue中会存在多步过期的样本,我们需要在Rollouter中设置“陈旧度 staleness”阈值, -由当前的参数版本以及生成的样本数量,决定是否要暂停生成。zeromq 的最大长度应为 staleness * total_size,并且实现基于陈旧度的拒绝策略,进行防御性编程。 - -* 当使用方案1时,参数的同步由FullyAsyncRollouter主动控制,触发时机取决预先设置的固定数量样本完成以及参数已就绪,产生的样本所使用的参数版本一致, - 但是避免不了长尾的问题,会有"rollout空洞"产生。 - -* 当使用方案2时,参数的同步会更加及时,陈旧度低的样本数量较多,但是长尾样本由不同的参数产生,长尾样本的不同token所对应的参数版本会传递给训练引擎, - 后续可以根据这一信息对loss进行加权处理。 - -当Rollouter生产慢于Trainer消费时,队列长时间为空,基本等价于同步训练。 \ No newline at end of file diff --git a/recipe/fully_async_policy/README_zh.md b/recipe/fully_async_policy/README_zh.md new file mode 100644 index 00000000000..6f88c8187be --- /dev/null +++ b/recipe/fully_async_policy/README_zh.md @@ -0,0 +1,316 @@ +# Recipe: Fully Async Policy Async Trainer + +**Author:** `https://github.com/meituan-search` + +Last updated: 10/13/2025. 
+ +本文档介绍了完全异步PPO训练系统,该系统实现了 Trainer 和 Rollouter 的完全解耦,支持异步样本生成和训练。 + +## Introduction + +### Background + +rollout和train分离架构相较于colocate的架构能够更加灵活地分配资源,设计更加灵活的训练逻辑,从而处理长尾等问题带来的GPU利用率低,训练效率低的问题。 +one_step_off_policy通过分离架构的设计并进行rollout和train一轮异步的训练方法,缓解了rollout时间过长的问题,并在训练效率上取得了一些收益, +但其强制使用一轮异步的数据,存在不够灵活等问题,而且并不能完全去除长尾对训练效率带来的影响;在其他框架如areal、Magistral、streamrl、asyncflow上, +已经基于分离架构实现了异步训练、流式训练,并取得了收益;我们借鉴其方法,在verl上进行了实现。fully_async_policy支持异步、流式、partial +rollout的训练, +通过合理设置资源分配情况、参数同步频率等参数,fully_async_policy能够显著提高训练效率。 + +> Magistral https://arxiv.org/abs/2506.10910 +> +> AReaL: A Large-Scale Asynchronous Reinforcement Learning System for Language +> Reasoning https://arxiv.org/abs/2505.24298 +> +> StreamRL: Scalable, Heterogeneous, and Elastic RL for LLMs with Disaggregated Stream +> Generation https://arxiv.org/abs/2504.15930 +> +> AsyncFlow: An Asynchronous Streaming RL Framework for Efficient LLM Post-Training https://arxiv.org/abs/2507.01663 +> + +### 核心贡献 + +* 资源隔离:与使用hybrid_engine不同,Rollouter和Trainer使用分离的计算资源,需要分别指定所占用的资源。 +* 生成与训练并行:Trainer在训练的同时,Rollouter在生成新的样本。 +* 多步异步: 相比 one step off policy 支持0.x步到多步的异步设定,异步方案更加灵活。 +* nccl参数同步:使用nccl通信原语进行Rollouter与Trainer参数的通信。 +* Stream推理与训练:Rollouter逐样本生成数据,同时数据传输以单个sample为最小传输单位。 +* 异步训练与新鲜度控制:通过设置参数async_training.staleness_threshold,支持使用旧参数生成的样本进行训练。 +* PartialRollout: Rollouter推理过程支持partial rollout逻辑,通过参数同步时,添加sleep()和resume() + 逻辑,保存进行中的rollout的样本,并在下一次rollout中继续使用,减少参数同步等待进行中的任务结束时间。 + +目前支持使用模式为 fsdp+vllm。vllm必须使用基于AgentLoop的server模式。 + +## 设计 + +fully_async_policy的整体架构如下图所示,fully_async_policy主要由Rollouter、MessageQueue、Trainer、ParameterSynchronizer四部分组成。 + +![fully_async_policy_structure]( +https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/fully_async_policy_structure.svg?raw=true) + +1. Rollouter逐样本生成序列,并将生成的sample放入MessageQueue中,生产的速度受新鲜度控制。 +2. MessageQueue用于暂存Rollouter生成的sample。 +3.
Trainer逐样本从MessageQueue中获取,获取到require_batches* + ppo_mini_batch_size数量的样本后,就会进行训练,训练async_training.trigger_parameter_sync_step轮后,触发与Rollouter的一次参数同步。 +4. ParameterSynchronizer 实现了Nccl的同步参数同步能力。 + +当前方案对比base的收益来源,在于colocate情况下,rollout使用更多的资源无法解决长尾样本带来的空闲,当我们进行资源隔离后,rollout的时间和train的时间都可能相较于之前更长(因为使用的资源变少了),但是相互之间的耗时overlap,端到端的耗时反而有所缩减。 + +![fully_async_policy_revenue]( +https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/fully_async_policy_revenue.svg?raw=true) + +## 使用方式 + +### 参数说明 + +| super params | implication | +|-----------------------------------------------|-----------------------------------------------------------------| +| `trainer.nnodes` | 表示Trainer的node数量 | +| `trainer.n_gpus_per_node` | 表示Trainer每个node上gpu的数量 | +| `rollout.nnodes` | 表示Rollouter的node数量 | +| `rollout.n_gpus_per_node` | 表示Rollouter每个node上gpu的数量 | +| `data.train_batch_size` | 在fully async策略中,该值不生效(默认设置为0) | +| `data.gen_batch_size` | 在fully async策略中,使用流式的样本生产逻辑(默认设置为1) | +| `rollout.total_rollout_steps` | 总的rollout的sample数量 | +| `rollout.test_freq` | 表示Rollouter每更新多少次参数,进行一次validation | +| `actor_rollout_ref.actor.ppo_mini_batch_size` | The ppo_mini_batch_size is a global num across all workers/gpus | +| `async_training.require_batches` | FullyAsyncTrainer一次性获取的ppo_mini_batch_size的数量 | +| `async_training.trigger_parameter_sync_step` | 表示FullyAsyncTrainer进行多少次本地更新后,进行一次参数同步 | +| `async_training.staleness_threshold` | 新鲜度控制 | +| `async_training.partial_rollout` | 是否进行partial_rollout | +| `async_training.use_rollout_log_probs` | 使用rollout产生的log_probs | + +进一步的解释: + +`rollout.total_rollout_steps` + +rollout.total_rollout_steps = data.train_batch_size * step + +`async_training.trigger_parameter_sync_step` + +在fully async策略中,表示Trainer进行多少次本地更新后(也就是获取多少次require_batches +*ppo_mini_batch_size数量样本),与Rollouter之间进行一次参数同步。 +每两次Rollouter和Trainer参数同步之间,Trainer将会处理trigger_parameter_sync_step*require_batches* +ppo_mini_batch_size份sample。 
+如果为了与colocate比较,在公平的情况下对比速度,trigger_parameter_sync_step应该设置为 data.train_batch_size / ( +require_batches * ppo_mini_batch_size)。 + +`async_training.staleness_threshold` + +在fully async策略中,表示最大允许使用的staleness样本的比例。 +staleness_threshold=0,表示同步训练。 +Rollouter两次参数更新之间将会生成固定数量的样本,样本数为: + +$$rollout\_num = (trigger\_parameter\_sync\_step*require\_batches*ppo\_mini\_batch\_size)$$ + +staleness_threshold>0,表示异步训练, 可以设置为小数,支持更灵活的异步调用。 + +Rollouter两次参数更新之间将会最多生成的样本数为: + +$$rollout\_num = (1+staleness\_threshold)*(trigger\_parameter\_sync\_step*require\_batches*ppo\_mini\_batch\_size) - num\_staleness\_sample$$ + +num_staleness_sample 表示上一次rollout多生成的陈旧样本数。 +由于是流式系统,rollout持续生成,trainer持续消费。如果rollouter较慢,trainer会更早触发参数同步,rollouter并不会实际生产rollout_num个样本。 +当rollout 足够快时,staleness_threshold设置为1,基本上等价于one_step_off policy。 +为了避免过期样本太多影响训练精度,建议该值设置小于1。 + +`async_training.partial_rollout` +partial_rollout只会在staleness_threshold>0时才实际上起作用。 + +`async_training.use_rollout_log_probs` +在强化学习算法中,log_probs与参数版本,token都存在隐性的相关性。由于PPO/GRPO/DAPO等算法的设定,我们在计算重要性采样时, +即 old_log_prob必须使用rollout参数及token所对应log_probs,才能保证算法的正确性。在fully +async策略中,我们默认old_log_prob是由rollout所计算的,而不是由trainer所计算。 + +### 模式支持 + +1. on policy pipeline: + 1. trigger_parameter_sync_step=1,staleness_threshold=0; + 2. Rollouter一次生产require_batches* + ppo_mini_batch_size的samples,Trainer获取这些samples后进行训练,训练完后Trainer和Rollouter之间进行一次参数同步; + 3. 在rollout阶段,如果存在长尾的样本,但是rollout样本数较少时,较短的样本无法填充到空闲的资源中,会造成一定的资源浪费。 + 4. 如图a所示; + +2. stream off policy pipeline: + 1. trigger_parameter_sync_step>1,staleness_threshold=0。 + 2. 将会进行同步的流式训练,Rollouter一次生产require_batches*ppo_mini_batch_size* + trigger_parameter_sync_step的samples,Trainer每获取require_batches* + ppo_mini_batch_size就进行一次本地训练,训练trigger_parameter_sync_step次后,Trainer和Rollouter之间进行一次参数同步; + 3. 相较于a,由于一次生成的样本更多,资源的空闲会更低。 + 4. 在一次step训练中,会存在两次资源闲置的时间,分别是在第一次获取样本时,train等待require_batches* + ppo_mini_batch_size个样本生产,以及最后一次参数更新时,rollout等待训练完成。 + 5. 如图b所示; + +3.
async stream pipeline with staleness samples: + 1. trigger_parameter_sync_step>=1,staleness_threshold>0,partial_rollout=Flase。 + 2. Rollouter在每次参数更新后将计划最多生产rollout_num个样本(实际根据rollout速度,生成的样本可能会少与这个值)。 + 3. + 如果rollout过程比较快,Rollouter将会在参数同步前额外生成一部分样本num_stale_samples,用于参数同步后立即给Trainer使用,如图c所示。触发参数同步时,如果Rollouter有正在生产的任务,将会等待任务完成,同时不会添加新的任务; + 4. 相较于b,除第一次step训练外,后续的训练都不会有wait first batch rollout finish的时间,但是会有wait active task + finish的时间。 + +4. async stream pipeline with partial rollout: + 1. trigger_parameter_sync_step>=1,staleness_threshold>0,partial_rollout=True。 + 2. 相较于c,触发参数同步时,Rollouter如果有正在生产的sample,会打断rollout过程并进行参数同步,被中断的sample会在参数同步后继续生成。减少了wait + active task finish的时间。 + +![fully_async_policy_mode]( +https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/fully_async_policy_mode.svg?raw=true) + +### 关键指标 + +| metrics | implication | +|------------------------------------------------|-----------------------------------------------------------| +| `trainer/idle_ratio` | Trainer闲置率 | +| `rollouter/idle_ratio` | Rollouter闲置率 | +| `fully_async/count/stale_samples_processed` | 训练使用的旧sample总数 | +| `fully_async/count/stale_trajectory_processed` | 训练使用的旧trajectory总数(一个sample会生产rollout.n条trajectory) | +| `fully_async/partial/total_partial_num` | 两次trigger_parameter_sync_step之间Trainer处理的partial样本数 | +| `fully_async/partial/partial_ratio` | 两次trigger_parameter_sync_step之间Trainer处理的partial样本的比例 | +| `fully_async/partial/max_partial_span` | 两次trigger_parameter_sync_step之间Trainer处理的partial样本的最大参数跨度 | + +### 调参建议 + +* 资源分配与调整: + * + 合理的资源分配是获得好的训练效率的前提。理想的资源分配情况应该是使得Rollout的时间和Train的时间接近,从而使得整个训练过程流水气泡最小,避免资源闲置,同时Trainer不会使用旧样本。在真实训练场景下,可以根据实际训练过程中rollout和train的空闲时间调整资源分配,可从rollouter/idle_ratio和trainer/idle_ratio获得,如果rollouter/idle_ratio较高trainer/idle_ratio较低,应该增多Trainer的资源减少Rollouter的资源,反之亦然。 + +* 关键参数: + * staleness_threshold: 设置太大会导致较多的旧样本使用,影响模型效果,建议设置小于1。 + * require_batches:越接近1,越接近纯流式过程,训练过程中bubble越小,能够在速度上获得更快的加速效果,但会对样本的处理顺序产生影响; + * 
trigger_parameter_sync_step: 设置的越小越接近on + policy但会导致频繁的参数同步,同时server模式,长尾样本浪费的资源无法被短样本填充,资源利用率低。设置的越大有更高的计算效率,但是精度上会受到off + policy的影响。 + * rollout.test_freq: 会占用Rollouter资源,不建议设置太小。 + +* 模式选择:正如[模式支持]章节介绍,通过调整不同的参数,Fully Async架构支持不同程度上的优化加速,适用于不同场景的任务。 + * 对于小规模任务,需要保证训练的稳定性和 on-policy 性,对速度要求不高的场景,可以尝试使用on policy pipeline的模式(模式1)。 + * 对于需要提高训练吞吐量,但对 staleness 敏感的场景,可以尝试使用 stream off policy pipeline 的模式。即通过 + 设置trigger_parameter_sync_step>1 ,提高 训练效率,但仍保持同步机制 (staleness_threshold=0 )(模式2)。 + * 对于大规模任务,对训练速度有较高要求,且可以容忍一定 off-policy 程度、staleness的场景,可以设置staleness_threshold> + 0、partial_rollout=True提高训练效率,使用 async stream pipeline 模式(模式 3 或 4)。 + +### 快速开始 + +```shell +rollout_mode="async" +rollout_name="vllm" # sglang or vllm +if [ "$rollout_mode" = "async" ]; then + export VLLM_USE_V1=1 + return_raw_chat="True" +fi + +train_prompt_bsz=0 +gen_prompt_bsz=1 +n_resp_per_prompt=16 +train_prompt_mini_bsz=32 +total_rollout_steps=$(((512*400))) +test_freq=10 +staleness_threshold=0 +trigger_parameter_sync_step=16 +partial_rollout=False + + +python -m recipe.fully_async_policy.fully_async_main \ + train_batch_size=${train_prompt_bsz} \ + data.gen_batch_size=${gen_prompt_bsz} \ + data.return_raw_chat=${return_raw_chat} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + actor_rollout_ref.actor.strategy=fsdp2 \ + critic.strategy=fsdp2 \ + actor_rollout_ref.hybrid_engine=False \ + actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.rollout.name=${rollout_name} \ + actor_rollout_ref.rollout.mode=${rollout_mode} \ + actor_rollout_ref.rollout.calculate_log_probs=True \ + trainer.nnodes="${NNODES_TRAIN}" \ + trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ + rollout.nnodes="${NNODES_ROLLOUT}" \ + rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ + rollout.total_rollout_steps="${total_rollout_steps}" \ + 
rollout.test_freq="${test_freq}" \ + async_training.staleness_threshold="${staleness_threshold}" \ + async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ + async_training.partial_rollout="${partial_rollout}" +``` + +## 实验 + +### 在7B模型上进行异步训练 + +* 机器:H20 +* 模型:Qwen2.5-Math-7B +* rollout长度:max_response_length FSDP2: 28K tokens; +* 算法:DAPO +* 数据集: TRAIN_FILE: dapo-math-17k.parquet TEST_FILE: aime-2024.parquet +* engine: vllm+FSDP2 +* rollout.n: 16 +* ppo_mini_batch_size: 32 +* test_freq: 20 + +* colocate sync: + * step: 400 + * train_batch_size: 512 + +* fully_async_policy + * total_rollout_steps: 512*400 + * require_batches: 4 + * trigger_parameter_sync_step: 4 + * staleness_threshold: 0.3 + * partial_rollout: True + +| training mode | Resource allocation | step | generate_sequences | old_log_prob | update_actor | total time | acc/best@32/mean | +|--------------------|---------------------|------|--------------------|--------------|--------------|------------|------------------| +| colocate sync | 32 | | | | | | | +| fully_async_policy | 16:16 | | | | | | | +| colocate sync | 64 | | | | | | | +| fully_async_policy | 32:32 | | | | | | | +| colocate sync | 128 | | | | | | | +| fully_async_policy | 64:64 | | | | | | | + +> https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg + +### 30B模型模式实验 + +* 机器: H20 +* 模型:Qwen2.5-32B +* rollout长度:max_response_length FSDP2: 20K tokens; +* 算法:DAPO +* engine: vllm+FSDP2 +* rollout.n: 16 +* ppo_mini_batch_size: 32 +* test_freq: 20 + +* colacate sync: + * step:200 + * train_batch_size: 512 + +* fully_async_policy + * total_rollout_steps: 512*200 + * trigger_parameter_sync_step: 512/32 = 16 + * staleness_threshold: 0 + * partial_rollout: False + +| training mode | Resource allocation | mode | step | generate_sequences | old_log_prob | update_actor | total time | acc/best@32/mean | 
+|--------------------|---------------------|----------------------------------------------|------|--------------------|--------------|--------------|------------|------------------| +| colocate sync | 128 | | | | | | | | +| fully_async_policy | 64:64 | stream off policy pipeline | | | | | | | +| fully_async_policy | 64:64 | async stream pipeline with staleness samples | | | | | | | +| fully_async_policy | 64:64 | async stream pipeline with partial rollout | | | | | | | + +### 128卡 require_batches 消融实验 + +### 128卡 stale 消融实验 + +## 后续计划 + +* GRPO实验 +* megatron 适配 +* sglang 集成 +* transfer queue 集成 +* 异步参数同步 +* Areal异步算法实现 +* TPPO算法实现 +* 多轮及Tool的支持 \ No newline at end of file From f658643036cf6016538a1a185727560e92e10738 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Tue, 14 Oct 2025 11:49:19 +0800 Subject: [PATCH 166/182] update README_zh.md --- recipe/fully_async_policy/README_zh.md | 96 +++++++++++++++----------- 1 file changed, 56 insertions(+), 40 deletions(-) diff --git a/recipe/fully_async_policy/README_zh.md b/recipe/fully_async_policy/README_zh.md index 6f88c8187be..8d7ce356335 100644 --- a/recipe/fully_async_policy/README_zh.md +++ b/recipe/fully_async_policy/README_zh.md @@ -14,8 +14,7 @@ rollout和train分离架构相较于colocate的架构能够更加灵活地分配 one_step_off_policy通过分离架构的设计并进行rollout和train一轮异步的训练方法,缓解了rollout时间过长的问题,并在训练效率上取得了一些收益, 但其强制使用一轮异步的数据,存在不够灵活等问题,而且并不能完全去除长尾对训练效率带来的的影响;在其他框架如areal、Magistral、streamrl、asyncflow上, 已经基于分离架构实现了异步训练、流式训练,并取得了收益;我们借鉴其方法,在verl上进行了实现。fully_async_policy支持异步、流式、partial -rollout的训练, -通过合理设置资源分配情况、参数同步频率等参数,fully_async_policy能够显著提高训练效率。 +rollout的训练, 通过合理设置资源分配情况、参数同步频率等参数,fully_async_policy能够显著提高训练效率。 > Magistral https://arxiv.org/abs/2506.10910 > @@ -30,13 +29,13 @@ rollout的训练, ### 核心贡献 -* 资源隔离:与使用hybrid_engine不同,Rollouter和Trainer使用分离的计算资源,需要分别指定所占用的资源。 -* 生成与训练并行:Trainer在训练的同时,Rollouter在生成新的样本。 -* 多步异步: 相比 one step off policy 支持0.x步到多步的异步设定,异步方案更加灵活。 -* nccl参数同步:使用nccl通信原语进行Rollouter与Trainer参数的通信。 -* 
Stream推理与训练:Rollouter逐样本生成数据,同时数据传输以单个sample为最小传输单位。 -* 异步训练与新鲜度控制:通过设置参数async_training.staleness_threshold,支持使用旧参数生成的样本进行训练。 -* PartialRollout: Rollouter推理过程支持partial rollout逻辑,通过参数同步时,添加sleep()和resume() +* **资源隔离**:与使用hybrid_engine不同,Rollouter和Trainer使用分离的计算资源,需要分别指定所占用的资源。 +* **生成与训练并行**:Trainer在训练的同时,Rollouter在生成新的样本。 +* **多步异步**: 相比 one step off policy 支持0.x步到多步的异步设定,异步方案更加灵活。 +* **nccl参数同步**:使用nccl通信原语进行Rollouter与Trainer参数的通信。 +* **Stream推理与训练**:Rollouter逐样本生成数据,同时数据传输以单个sample为最小传输单位。 +* **异步训练与新鲜度控制**:通过设置参数async_training.staleness_threshold,支持使用旧参数生成的样本进行训练。 +* **PartialRollout**: Rollouter推理过程支持partial rollout逻辑,通过参数同步时,添加`sleep()`和`resume()` 逻辑,保存进行中的rollout的样本,并在下一次rollout中继续使用,减少参数同步等待进行中的任务结束时间。 目前支持使用模式为 fsdp+vllm。vllm必须使用基于AgentLoop的server模式。 @@ -50,11 +49,13 @@ https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/fully_a 1. Rollouter逐样本生成序列,并将生成的sample放入MessageQueue中,生产的速度受新鲜度控制。 2. MessageQueue用于暂存Rollouter生成的sample。 -3. Trainer逐样本从MessageQueue中获取,获取到require_batches* - ppo_mini_batch_size数量的样本后,就会进行训练,训练async_training.trigger_parameter_sync_step轮后,触发与Rollouter的一次参数同步。 +3. Trainer逐样本从MessageQueue中获取,获取到`require_batches*ppo_mini_batch_size` + 数量的样本后,就会进行训练,训练async_training.trigger_parameter_sync_step轮后,触发与Rollouter的一次参数同步。 4. 
ParameterSynchronizer 实现了Nccl的同步参数同步能力。 -当前方案对比base的收益来源,在于colocate情况下,rollout使用更多的资源无法解决长尾样本带来的空闲,当我们进行资源隔离后,rollout的时间和train的时间都可能相较于之前更长(因为使用的资源变少了),但是相互之间的耗时overlap,端到端的耗时反而有所缩减。 +当前方案对比base的收益来源,在于colocate情况下,rollout使用更多的资源无法解决长尾样本带来的空闲, +当我们进行资源隔离后,rollout的时间和train的时间都可能相较于之前更长(因为使用的资源变少了), +但是相互之间的耗时overlap,端到端的耗时反而有所缩减。 ![fully_async_policy_revenue]( https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/fully_async_policy_revenue.svg?raw=true) @@ -65,14 +66,14 @@ https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/fully_a | super params | implication | |-----------------------------------------------|-----------------------------------------------------------------| -| `trainer.nnodes` | 表示Trainer的node数量 | -| `trainer.n_gpus_per_node` | 表示Trainer每个node上gpu的数量 | -| `rollout.nnodes` | 表示Rollouter的node数量 | -| `rollout.n_gpus_per_node` | 表示Rollouter每个node上gpu的数量 | +| `trainer.nnodes` | Trainer的node数量 | +| `trainer.n_gpus_per_node` | Trainer每个node上gpu的数量 | +| `rollout.nnodes` | Rollouter的node数量 | +| `rollout.n_gpus_per_node` | Rollouter每个node上gpu的数量 | | `data.train_batch_size` | 在fully async策略中,该值不生效(默认设置为0) | | `data.gen_batch_size` | 在fully async策略中,使用流式的样本生产逻辑(默认设置为1) | | `rollout.total_rollout_steps` | 总的rollout的sample数量 | -| `rollout.test_freq` | 表示Rollouter每更新多少次参数,进行一次validation | +| `rollout.test_freq` | Rollouter每更新多少次参数,进行一次validation | | `actor_rollout_ref.actor.ppo_mini_batch_size` | The ppo_mini_batch_size is a global num across all workers/gpus | | `async_training.require_batches` | FullyAsyncTrainer一次性获取的ppo_mini_batch_size的数量 | | `async_training.trigger_parameter_sync_step` | 表示FullyAsyncTrainer进行多少次本地更新后,进行一次参数同步 | @@ -80,44 +81,45 @@ https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/fully_a | `async_training.partial_rollout` | 是否进行partial_rollout | | `async_training.use_rollout_log_probs` | 使用rollout产生的log_probs | -进一步的解释: +**进一步的解释:** `rollout.total_rollout_steps` 
-rollout.total_rollout_steps = data.train_batch_size * step +与 colocate 相比,数量可以通过 train_batch_size 与 step 相乘对齐: rollout.total_rollout_steps = data.train_batch_size * +step。 `async_training.trigger_parameter_sync_step` -在fully async策略中,表示Trainer进行多少次本地更新后(也就是获取多少次require_batches -*ppo_mini_batch_size数量样本),与Rollouter之间进行一次参数同步。 -每两次Rollouter和Trainer参数同步之间,Trainer将会处理trigger_parameter_sync_step*require_batches* +在fully async策略中,表示Trainer进行多少次本地更新后(也就是获取多少次require_batches\* ppo_mini_batch_size数量样本), +与Rollouter之间进行一次参数同步。 +每两次Rollouter和Trainer参数同步之间,Trainer将会处理trigger_parameter_sync_step\* require_batches\* ppo_mini_batch_size份sample。 -如果为了与colocate比较,在公平的情况下对比速度,trigger_parameter_sync_step应该设置为 data.train_batch_size / ( -require_batches * ppo_mini_batch_size)。 +如果为了与colocate在公平的情况下对比速度,trigger_parameter_sync_step应该设置为 data.train_batch_size / ( +require_batches \* ppo_mini_batch_size)。 `async_training.staleness_threshold` 在fully async策略中,表示最大允许使用的staleness样本的比例。 -staleness_threshold=0,表示同步训练。 -Rollouter两次参数更新之间将会生成固定数量的样本,样本数为: -$$$rollout\_num = (trigger\_parameter\_sync\_step*require\_batches*ppo\_mini\_batch\_size)$$$ - -staleness_threshold>0,表示异步训练, 可以设置为小数,支持更灵活的异步调用。 - -Rollouter两次参数更新之间将会最多生成的样本数为: - -$$$rollout\_num = (1+staleness\_threshold)*(trigger\_parameter\_sync\_step*require\_batches*ppo\_mini\_batch\_size) - num\_staleness\_sample $$$ +* staleness_threshold=0,表示同步训练。 + Rollouter两次参数更新之间将会生成固定数量的样本,样本数为: + $$rollout\_num = (trigger\_parameter\_sync\_step*require\_batches*ppo\_mini\_batch\_size)$$ +* staleness_threshold>0,表示异步训练, 可以设置为小数,支持更灵活的异步调用。 + Rollouter两次参数更新之间将会最多生成的样本数为: + $$rollout\_num = (1+staleness\_threshold)*(trigger\_parameter\_sync\_step*require\_batches*ppo\_mini\_batch\_size) - num\_staleness\_sample $$ num_staleness_sample 表示上一次rollout多生成的陈旧样本数。 + 由于是流式系统,rollout持续生成,trainer持续消费。如果rollouter较慢,trainer会更早触发参数同步,rollouter并不会实际生产rollout_num个样本。 当rollout 足够快时,staleness_threshold设置为1,基本上等价于one_step_off policy。 为了避免过期样本太多影响训练精度,建议该值设置小于1。 
`async_training.partial_rollout` + partial_rollout只会在staleness_threshold>0时才实际上起作用。 `async_training.use_rollout_log_probs` + 在强化学习算法中,log_probs与参数版本,token都存在隐性的相关性。由于PPO/GRPO/DAPO等算法的设定,我们在计算重要性采样时, 即 old_log_prob必须使用rollout参数及token所对应log_probs,才能保证算法的正确性。在fully async策略中,我们默认old_log_prob是有rollout所计算的,而不是由trainer所计算。 @@ -144,15 +146,17 @@ async策略中,我们默认old_log_prob是有rollout所计算的,而不是 3. async stream pipeline with staleness samples: 1. trigger_parameter_sync_step>=1,staleness_threshold>0,partial_rollout=Flase。 2. Rollouter在每次参数更新后将计划最多生产rollout_num个样本(实际根据rollout速度,生成的样本可能会少与这个值)。 - 3. - 如果rollout过程比较快,Rollouter将会在参数同步前额外生成一部分样本num_stale_samples,用于参数同步后立即给Trainer使用,如图c所示。触发参数同步时,如果Rollouter有正在生产的任务,将会等待任务完成,同时不会添加新的任务; + 3. 如果rollout过程比较快,Rollouter将会在参数同步前额外生成一部分样本num_stale_samples,用于参数同步后立即给Trainer使用。 + 触发参数同步时,如果Rollouter有正在生产的任务,将会等待任务完成,同时不会添加新的任务; 4. 相较于b,除第一次step训练外,后续的训练都不会有wait first batch rollout finish的时间,但是会有wait active task finish的时间。 + 5. 如图c所示; 4. async stream pipeline with partial rollout: 1. trigger_parameter_sync_step>=1,staleness_threshold>0,partial_rollout=True。 2. 相较于c,触发参数同步时,Rollouter如果有正在生产的sample,会打断rollout过程并进行参数同步,被中断的sample会在参数同步后继续生成。减少了wait active task finish的时间。 + 3. 
如图d所示; ![fully_async_policy_mode]( https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/fully_async_policy_mode.svg?raw=true) @@ -178,12 +182,11 @@ https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/fully_a * 关键参数: * staleness_threshold: 设置太大会导致较多的旧样本使用,影响模型效果,建议设置小于1。 * require_batches:越接近1,越接近纯流式过程,训练过程中bubble越小,能够在速度上获得更快的加速效果,但会对样本的处理顺序产生影响; - * trigger_parameter_sync_step: 设置的越小越接近on - policy但会导致频繁的参数同步,同时server模式,长尾样本浪费的资源无法被短样本填充,资源利用率低。设置的越大有更高的计算效率,但是精度上会受到off - policy的影响。 + * trigger_parameter_sync_step: 设置的越小越接近on policy,但会导致频繁的参数同步,长尾样本浪费的资源无法被短样本填充,资源利用率低。 + 设置的越大有更高的计算效率,但是精度上会受到off policy的影响。 * rollout.test_freq: 会占用Rollouter资源,不建议设置太小。 -* 模式选择:正如[模式支持]章节介绍,通过调整不同的参数,Fully Async架构支持不同程度上的优化加速,适用于不同场景的任务。 +* 模式选择:通过调整不同的参数,Fully Async架构支持不同程度上的优化加速,适用于不同场景的任务。 * 对于小规模任务,需要保证训练的稳定性和 on-policy 性,对速度要求不高的场景,可以尝试使用on policy pipeline的模式(模式1)。 * 对于需要提高训练吞吐量,但对 staleness 敏感的场景,可以尝试使用 stream off policy pipeline 的模式。即通过 设置trigger_parameter_sync_step>1 ,提高 训练效率,但仍保持同步机制 (staleness_threshold=0 )(模式2)。 @@ -302,14 +305,27 @@ python -m recipe.fully_async_policy.fully_async_main \ ### 128卡 require_batches 消融实验 +| training mode | Resource allocation | require_size | step | generate_sequences | old_log_prob | update_actor | total time | acc/best@32/mean | +|--------------------|---------------------|--------------|------|--------------------|--------------|--------------|------------|------------------| +| fully_async_policy | 64:64 | 1 | | | | | | | +| fully_async_policy | 64:64 | 2 | | | | | | | +| fully_async_policy | 64:64 | 4 | | | | | | | + ### 128卡 stale 消融实验 +| training mode | Resource allocation | staleness | step | generate_sequences | old_log_prob | update_actor | total time | acc/best@32/mean | +|--------------------|---------------------|-----------|------|--------------------|--------------|--------------|------------|------------------| +| fully_async_policy | 64:64 | 0 | | | | | | | +| fully_async_policy | 
64:64 | 0.1 | | | | | | | +| fully_async_policy | 64:64 | 0.3 | | | | | | | +| fully_async_policy | 64:64 | 0.5 | | | | | | | + ## 后续计划 * GRPO实验 * megatron 适配 * sglang 集成 -* transfer queue 集成 +* transfer queue 集成 * 异步参数同步 * Areal异步算法实现 * TPPO算法实现 From 1a3759e30e7d390d803b0db1a9f1dcee579072ff Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Tue, 14 Oct 2025 11:50:57 +0800 Subject: [PATCH 167/182] update README_zh.md --- recipe/fully_async_policy/README_zh.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/recipe/fully_async_policy/README_zh.md b/recipe/fully_async_policy/README_zh.md index 8d7ce356335..d1080d584a4 100644 --- a/recipe/fully_async_policy/README_zh.md +++ b/recipe/fully_async_policy/README_zh.md @@ -273,7 +273,7 @@ python -m recipe.fully_async_policy.fully_async_main \ | colocate sync | 128 | | | | | | | | fully_async_policy | 64:64 | | | | | | | -> https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg +>source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg ### 30B模型模式实验 @@ -303,6 +303,8 @@ python -m recipe.fully_async_policy.fully_async_main \ | fully_async_policy | 64:64 | async stream pipeline with staleness samples | | | | | | | | fully_async_policy | 64:64 | async stream pipeline with partial rollout | | | | | | | +>source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg + ### 128卡 require_batches 消融实验 | training mode | Resource allocation | require_size | step | generate_sequences | old_log_prob | update_actor | total time | acc/best@32/mean | @@ -311,6 +313,8 @@ python -m recipe.fully_async_policy.fully_async_main \ | fully_async_policy | 64:64 | 2 | | | | | | | | fully_async_policy | 64:64 | 4 | | | | | | | +>source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg + ### 128卡 stale 消融实验 | training mode | Resource allocation | staleness | step | generate_sequences | old_log_prob | update_actor | total time | acc/best@32/mean | @@ -320,6 +324,8 
@@ python -m recipe.fully_async_policy.fully_async_main \ | fully_async_policy | 64:64 | 0.3 | | | | | | | | fully_async_policy | 64:64 | 0.5 | | | | | | | +>source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg + ## 后续计划 * GRPO实验 From ed73079fb67352d20ab0bf6b9bceffea611d7350 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Tue, 14 Oct 2025 18:56:25 +0800 Subject: [PATCH 168/182] update README --- recipe/fully_async_policy/README_zh.md | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/recipe/fully_async_policy/README_zh.md b/recipe/fully_async_policy/README_zh.md index d1080d584a4..7d385e03abd 100644 --- a/recipe/fully_async_policy/README_zh.md +++ b/recipe/fully_async_policy/README_zh.md @@ -176,8 +176,10 @@ https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/fully_a ### 调参建议 * 资源分配与调整: - * - 合理的资源分配是获得好的训练效率的前提。理想的资源分配情况应该是使得Rollout的时间和Train的时间接近,从而使得整个训练过程流水气泡最小,避免资源闲置,同时Trainer不会使用旧样本。在真实训练场景下,可以根据实际训练过程中rollout和train的空闲时间调整资源分配,可从rollouter/idle_ratio和trainer/idle_ratio获得,如果rollouter/idle_ratio较高trainer/idle_ratio较低,应该增多Trainer的资源减少Rollouter的资源,反之亦然。 + * 合理的资源分配是获得好的训练效率的前提。理想的资源分配情况应该是使得Rollout的时间和Train的时间接近,从而使得整个训练过程流水气泡最小, + 避免资源闲置,同时Trainer不会使用旧样本。在真实训练场景下,可以根据实际训练过程中rollout和train的空闲时间调整资源分配, + 可从rollouter/idle_ratio和trainer/idle_ratio获得,如果rollouter/idle_ratio较高trainer/idle_ratio较低, + 应该增多Trainer的资源减少Rollouter的资源,反之亦然。 * 关键参数: * staleness_threshold: 设置太大会导致较多的旧样本使用,影响模型效果,建议设置小于1。 @@ -273,7 +275,7 @@ python -m recipe.fully_async_policy.fully_async_main \ | colocate sync | 128 | | | | | | | | fully_async_policy | 64:64 | | | | | | | ->source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg +> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg ### 30B模型模式实验 @@ -303,7 +305,7 @@ python -m recipe.fully_async_policy.fully_async_main \ | fully_async_policy | 64:64 | async stream pipeline with staleness samples | | | | | | | | 
fully_async_policy | 64:64 | async stream pipeline with partial rollout | | | | | | | ->source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg +> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg ### 128卡 require_batches 消融实验 @@ -313,7 +315,7 @@ python -m recipe.fully_async_policy.fully_async_main \ | fully_async_policy | 64:64 | 2 | | | | | | | | fully_async_policy | 64:64 | 4 | | | | | | | ->source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg +> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg ### 128卡 stale 消融实验 @@ -324,7 +326,7 @@ python -m recipe.fully_async_policy.fully_async_main \ | fully_async_policy | 64:64 | 0.3 | | | | | | | | fully_async_policy | 64:64 | 0.5 | | | | | | | ->source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg +> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg ## 后续计划 From ad595f719d8ec897672bf5bbd8945d678165085d Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Tue, 14 Oct 2025 21:05:58 +0800 Subject: [PATCH 169/182] add ci --- .github/workflows/e2e_fully_async_policy.yml | 149 +++++++++++++++++++ docs/advance/fully_async.md | 0 docs/index.rst | 1 + tests/special_e2e/run_fully_async_policy.sh | 8 +- 4 files changed, 154 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/e2e_fully_async_policy.yml create mode 100644 docs/advance/fully_async.md diff --git a/.github/workflows/e2e_fully_async_policy.yml b/.github/workflows/e2e_fully_async_policy.yml new file mode 100644 index 00000000000..e2cf0d1c061 --- /dev/null +++ b/.github/workflows/e2e_fully_async_policy.yml @@ -0,0 +1,149 @@ +# # Tests layout + +# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance: +# - `tests/trainer` for testing functionality related to `verl/trainer` +# - `tests/models` for testing functionality related to `verl/models` +# - ... 
+ +# There are a few folders with `special_` prefix, created for special purposes: +# - `special_distributed`: unit tests that must run with multiple GPUs +# - `special_e2e`: end-to-end tests with training/generation scripts +# - `special_npu`: tests for NPUs +# - `special_sanity`: a suite of quick sanity tests +# - `special_standalone`: a set of tests that are designed to run in dedicated environments + +# Accelerators for tests +# - By default tests are run with GPU available, except for the ones under `special_npu`, and any test script whose name ends with `on_cpu.py`. +# - Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment. + +# # Workflow layout + +# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs: +# 1. A list of always triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml` +# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml` +# 3. End-to-end tests: `e2e_*.yml` +# 4. Unit tests +# - `cpu_unit_tests.yml`, run pytest on all scripts with file name pattern `tests/**/test_*_on_cpu.py` +# - `gpu_unit_tests.yml`, run pytest on all scripts without the `on_cpu.py` suffix. +# - Since cpu/gpu unit tests by default run all tests under `tests`, please make sure tests are manually excluded in them when +# - new workflow yaml is added to `.github/workflows` +# - new tests are added to workflow mentioned in 2. + + +name: e2e_fully_async_policy + +on: + # Trigger the workflow on push or pull request, + # but only for the main branch + # For push, for now only anti-patterns are specified so it is more conservative + # and achieves higher coverage. 
+ push: + branches: + - main + - v0.* + paths: + - "**/*.py" + - "!**/*.md" + - "!**/*.sh" + # Other entrypoints + - "!examples/*trainer*" + - "!tests/**" + - "!verl/trainer/main_*.py" + - "!verl/trainer/fsdp_sft_trainer.py" + - "!recipe/**" + - "recipe/fully_async_policy" + pull_request: + branches: + - main + - v0.* + paths: + - "**/*.py" + - "!**/*.md" + - "!**/*.sh" + # Other entrypoints + - "!examples/**" + - "!tests/**" + - "!verl/trainer/main_*.py" + - "!verl/trainer/fsdp_sft_trainer.py" + # Other recipes + - "!recipe/**" + # Home + - "recipe/fully_async_policy" + # Entrypoints + - ".github/workflows/e2e_fully_async_policy.yml" + - "examples/data_preprocess/gsm8k.py" + - "tests/special_e2e/run_fully_async_policy.sh" + +# Cancel jobs on the same ref if a new one is triggered +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} + +# Declare permissions just read content. +permissions: + contents: read + +env: + IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2" + DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner" + TRANSFORMERS_VERSION: "4.56.2" + +jobs: + setup: + if: github.repository_owner == 'volcengine' + runs-on: ubuntu-latest + outputs: + runner-label: ${{ steps.create-runner.outputs.runner-label }} + mlp-task-id: ${{ steps.create-runner.outputs.mlp-task-id }} + steps: + - uses: actions/checkout@v4 + - id: create-runner + uses: volcengine/vemlp-github-runner@v1 + with: + mode: "create" + faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}" + mlp-image: "${{ env.IMAGE }}" + + # Test FSDP2 strategy + e2e_fully_async_policy_fsdp2: + needs: setup + runs-on: [ "${{ needs.setup.outputs.runner-label || 'L20x8' }}" ] + timeout-minutes: 10 # Increase timeout for async training + env: + HTTP_PROXY: ${{ secrets.PROXY_HTTP }} + HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} + NO_PROXY: 
"localhost,127.0.0.1,hf-mirror.com" + HF_ENDPOINT: "https://hf-mirror.com" + HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable + ACTOR_STRATEGY: "fsdp2" + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + fetch-depth: 0 + - name: Install the current repository + run: | + pip3 install --no-deps -e .[test,gpu] + pip3 install transformers==$TRANSFORMERS_VERSION + - name: Prepare GSM8K dataset + run: | + python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k + - name: Running the E2E test with fully_async_policy algorithm (FSDP2) + run: | + ray stop --force + bash tests/special_e2e/run_fully_async_policy.sh + + cleanup: + runs-on: ubuntu-latest + needs: + [ + setup, + e2e_fully_async_policy_fsdp2 + ] + if: always() + steps: + - id: destroy-runner + uses: volcengine/vemlp-github-runner@v1 + with: + mode: "destroy" + faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}" + mlp-task-id: "${{ needs.setup.outputs.mlp-task-id }}" \ No newline at end of file diff --git a/docs/advance/fully_async.md b/docs/advance/fully_async.md new file mode 100644 index 00000000000..e69de29bb2d diff --git a/docs/index.rst b/docs/index.rst index 68e37545dba..e8467dc965a 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -124,6 +124,7 @@ verl is fast with: advance/rollout_is_migration.md advance/one_step_off advance/agent_loop + advance/fully_async .. 
toctree:: :maxdepth: 1 diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh index 096cb05c7a1..a2f99f0d67b 100644 --- a/tests/special_e2e/run_fully_async_policy.sh +++ b/tests/special_e2e/run_fully_async_policy.sh @@ -55,11 +55,11 @@ n_gpus_training=4 train_prompt_bsz=0 gen_prompt_bsz=1 n_resp_per_prompt=16 -train_prompt_mini_bsz=32 -total_rollout_steps=$(((128*2))) -test_freq=10 +train_prompt_mini_bsz=16 +total_rollout_steps=$(((128))) +test_freq=-1 staleness_threshold=0.1 -trigger_parameter_sync_step=16 +trigger_parameter_sync_step=4 partial_rollout=True exp_name="$(basename "${MODEL_ID,,}")-fully-async-policy-${ACTOR_STRATEGY}-minimal" From 1f51b0dfaea9105bf365d2bf07b7d48b23717a7f Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Tue, 14 Oct 2025 21:32:58 +0800 Subject: [PATCH 170/182] update readme --- recipe/fully_async_policy/README_zh.md | 143 ++++++++++++++----------- verl/trainer/ppo/ray_trainer.py | 2 - 2 files changed, 83 insertions(+), 62 deletions(-) diff --git a/recipe/fully_async_policy/README_zh.md b/recipe/fully_async_policy/README_zh.md index 7d385e03abd..040dfe47dd1 100644 --- a/recipe/fully_async_policy/README_zh.md +++ b/recipe/fully_async_policy/README_zh.md @@ -83,68 +83,75 @@ https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/fully_a **进一步的解释:** -`rollout.total_rollout_steps` +* `rollout.total_rollout_steps` -与 colocate 相比,数量可以通过 train_batch_size 与 step 相乘对齐: rollout.total_rollout_steps = data.train_batch_size * -step。 + 与 colocate 相比,数量可以通过 train_batch_size 与 step 相乘对齐: + `rollout.total_rollout_steps = data.train_batch_size * step`。 -`async_training.trigger_parameter_sync_step` +* `async_training.trigger_parameter_sync_step` -在fully async策略中,表示Trainer进行多少次本地更新后(也就是获取多少次require_batches\* ppo_mini_batch_size数量样本), -与Rollouter之间进行一次参数同步。 -每两次Rollouter和Trainer参数同步之间,Trainer将会处理trigger_parameter_sync_step\* require_batches\* -ppo_mini_batch_size份sample。 
-如果为了与colocate在公平的情况下对比速度,trigger_parameter_sync_step应该设置为 data.train_batch_size / ( -require_batches \* ppo_mini_batch_size)。 + 在fully async策略中,表示Trainer进行多少次本地更新后(也就是获取多少次`require_batches * ppo_mini_batch_size`数量样本), + 与Rollouter之间进行一次参数同步。 + 每两次Rollouter和Trainer参数同步之间,Trainer将会处理`trigger_parameter_sync_step * require_batches * + ppo_mini_batch_size`份sample。 + 如果为了与colocate在公平的情况下对比速度,trigger_parameter_sync_step应该设置为 `data.train_batch_size / ( + require_batches * ppo_mini_batch_size)`。 -`async_training.staleness_threshold` +* `async_training.staleness_threshold` -在fully async策略中,表示最大允许使用的staleness样本的比例。 + 在fully async策略中,表示最大允许使用的staleness样本的比例。 -* staleness_threshold=0,表示同步训练。 - Rollouter两次参数更新之间将会生成固定数量的样本,样本数为: - $$rollout\_num = (trigger\_parameter\_sync\_step*require\_batches*ppo\_mini\_batch\_size)$$ -* staleness_threshold>0,表示异步训练, 可以设置为小数,支持更灵活的异步调用。 - Rollouter两次参数更新之间将会最多生成的样本数为: - $$rollout\_num = (1+staleness\_threshold)*(trigger\_parameter\_sync\_step*require\_batches*ppo\_mini\_batch\_size) - num\_staleness\_sample $$ + * staleness_threshold=0,表示同步训练。 + Rollouter两次参数更新之间将会生成固定数量的样本,样本数为: + $$rollout\_num = (trigger\_parameter\_sync\_step*require\_batches*ppo\_mini\_batch\_size)$$ + * staleness_threshold>0,表示异步训练, 可以设置为小数,支持更灵活的异步调用。 + Rollouter两次参数更新之间将会最多生成的样本数为: + $$rollout\_num = (1+staleness\_threshold)*(trigger\_parameter\_sync\_step*require\_batches*ppo\_mini\_batch\_size) - num\_staleness\_sample $$ -num_staleness_sample 表示上一次rollout多生成的陈旧样本数。 + num_staleness_sample 表示上一次rollout多生成的陈旧样本数。 -由于是流式系统,rollout持续生成,trainer持续消费。如果rollouter较慢,trainer会更早触发参数同步,rollouter并不会实际生产rollout_num个样本。 -当rollout 足够快时,staleness_threshold设置为1,基本上等价于one_step_off policy。 -为了避免过期样本太多影响训练精度,建议该值设置小于1。 + 由于是流式系统,rollout持续生成,trainer持续消费。如果rollouter较慢,trainer会更早触发参数同步,rollouter并不会实际生产rollout_num个样本。 + 当rollout 足够快时,staleness_threshold设置为1,基本上等价于one_step_off policy。 + 为了避免过期样本太多影响训练精度,建议该值设置小于1。 -`async_training.partial_rollout` +* `async_training.partial_rollout` 
-partial_rollout只会在staleness_threshold>0时才实际上起作用。 + partial_rollout只会在staleness_threshold>0时才实际上起作用。 -`async_training.use_rollout_log_probs` +* `async_training.use_rollout_log_probs` -在强化学习算法中,log_probs与参数版本,token都存在隐性的相关性。由于PPO/GRPO/DAPO等算法的设定,我们在计算重要性采样时, -即 old_log_prob必须使用rollout参数及token所对应log_probs,才能保证算法的正确性。在fully -async策略中,我们默认old_log_prob是有rollout所计算的,而不是由trainer所计算。 + 在强化学习算法中,log_probs与参数版本,token都存在隐性的相关性。由于PPO/GRPO/DAPO等算法的设定,我们在计算重要性采样时, + 即 old_log_prob必须使用rollout参数及token所对应log_probs,才能保证算法的正确性。在fully + async策略中,我们默认old_log_prob是由rollout所计算的,而不是由trainer所计算。 + + * `async_training.require_batches` + + 在流式训练中,require_batches 应该设置为1,表示生产够ppo_mini_batch_size样本后,就进行训练。 + 在实际测试中,我们发现,如果单次下发的样本较少,由于数据分发的顺序,会导致训练不稳定,response 长度变长。 + 在这里,我们额外提供 require_batches 参数,用于控制流式分发时单次参与训练的样本数量。 + ### 模式支持 1. on policy pipeline: - 1. trigger_parameter_sync_step=1,staleness_threshold=0; - 2. Rollouter一次生产require_batches* - ppo_mini_batch_size的samples,Trainer获取这些samples后进行训练,训练完后Trainer和Rollouter之间进行一次参数同步; + 1. **trigger_parameter_sync_step=1,staleness_threshold=0** + 2. Rollouter一次生产`require_batches*ppo_mini_batch_size` + 的samples,Trainer获取这些samples后进行训练,训练完后Trainer和Rollouter之间进行一次参数同步; 3. 在rollout阶段,如果存在长尾的样本,但是rollout样本数较少时,较短的样本无法填充到空闲的资源中,会造成一定的资源浪费。 4. 如图a所示; 2. stream off policy pipeline: - 1. trigger_parameter_sync_step>1,staleness_threshold=0。 - 2. 将会进行同步的流式训练,Rollouter一次生产require_batches*ppo_mini_batch_size* - trigger_parameter_sync_step的samples,Trainer每获取require_batches* - ppo_mini_batch_size就进行一次本地训练,训练trigger_parameter_sync_step次后,Trainer和Rollouter之间进行一次参数同步; + 1. **trigger_parameter_sync_step>1,staleness_threshold=0** + 2. 将会进行同步的流式训练,Rollouter一次生产`require_batches*ppo_mini_batch_size*trigger_parameter_sync_step` + 的samples,Trainer每获取`require_batches*ppo_mini_batch_size` + 就进行一次本地训练,训练trigger_parameter_sync_step次后,Trainer和Rollouter之间进行一次参数同步; 3. 相较于a,由于一次生成的样本更多,资源的空闲会更低。 - 4. 
在一次step训练中,会存在两次资源闲置的时间,分别是在第一次获取样本时,train等待require_batches* - ppo_mini_batch_size个样本生产,以及最后一次参数更新时,rollout等待训练完成。 + 4. 在一次step训练中,会存在两次资源闲置的时间,分别是在第一次获取样本时,train等待`require_batches*ppo_mini_batch_size` + 个样本生产,以及最后一次参数更新时,rollout等待训练完成。 5. 如图b所示; 3. async stream pipeline with staleness samples: - 1. trigger_parameter_sync_step>=1,staleness_threshold>0,partial_rollout=Flase。 + 1. **trigger_parameter_sync_step>=1,staleness_threshold>0,partial_rollout=False** 2. Rollouter在每次参数更新后将计划最多生产rollout_num个样本(实际根据rollout速度,生成的样本可能会少与这个值)。 3. 如果rollout过程比较快,Rollouter将会在参数同步前额外生成一部分样本num_stale_samples,用于参数同步后立即给Trainer使用。 触发参数同步时,如果Rollouter有正在生产的任务,将会等待任务完成,同时不会添加新的任务; @@ -153,7 +160,7 @@ async策略中,我们默认old_log_prob是有rollout所计算的,而不是 5. 如图c所示; 4. async stream pipeline with partial rollout: - 1. trigger_parameter_sync_step>=1,staleness_threshold>0,partial_rollout=True。 + 1. **trigger_parameter_sync_step>=1,staleness_threshold>0,partial_rollout=True** 2. 相较于c,触发参数同步时,Rollouter如果有正在生产的sample,会打断rollout过程并进行参数同步,被中断的sample会在参数同步后继续生成。减少了wait active task finish的时间。 3. 
如图d所示; @@ -245,6 +252,8 @@ python -m recipe.fully_async_policy.fully_async_main \ ### 在7B模型上进行异步训练 +我们使用 Qwen2.5-Math-7B 验证 fully async 策略在长候选下,各个资源的收益。 + * 机器:H20 * 模型:Qwen2.5-Math-7B * rollout长度:max_response_length FSDP2: 28K tokens; @@ -277,6 +286,41 @@ python -m recipe.fully_async_policy.fully_async_main \ > source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg +### 128卡 7B 异步模式实验 + +我们使用 Qwen2.5-Math-7B 验证 fully async 所支持的各个模型的效果。 + +| training mode | Resource allocation | mode | step | generate_sequences | old_log_prob | update_actor | total time | acc/best@32/mean | +|--------------------|---------------------|------------------------------------------------|------|--------------------|--------------|--------------|------------|------------------| +| fully_async_policy | 64:64 | `stream off policy pipeline` | | | | | | | +| fully_async_policy | 64:64 | `async stream pipeline with staleness samples` | | | | | | | +| fully_async_policy | 64:64 | `async stream pipeline with partial rollout` | | | | | | | + +### 128卡 stale 消融实验 + +在 `async stream pipeline with partial rollout` 模式下,我们验证 staleness 的设置对于训练效率的影响。 + +| training mode | Resource allocation | staleness | step | generate_sequences | old_log_prob | update_actor | total time | acc/best@32/mean | +|--------------------|---------------------|-----------|------|--------------------|--------------|--------------|------------|------------------| +| fully_async_policy | 64:64 | 0 | | | | | | | +| fully_async_policy | 64:64 | 0.1 | | | | | | | +| fully_async_policy | 64:64 | 0.3 | | | | | | | +| fully_async_policy | 64:64 | 0.5 | | | | | | | + +> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg + +### 128卡 7B require_batches 消融实验 + +在多次测试下,我们发现流式每次下发样本的数量,会影响训练的结果,我们通过修改 `async_training.require_batches` 验证对与结果的影响。 + +| training mode | Resource allocation | async_training.require_batches | step | generate_sequences | old_log_prob | update_actor | total time | 
acc/best@32/mean | +|--------------------|---------------------|--------------------------------|------|--------------------|--------------|--------------|------------|------------------| +| fully_async_policy | 64:64 | 1 | | | | | | | +| fully_async_policy | 64:64 | 2 | | | | | | | +| fully_async_policy | 64:64 | 4 | | | | | | | + +> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg + ### 30B模型模式实验 * 机器: H20 @@ -307,27 +351,6 @@ python -m recipe.fully_async_policy.fully_async_main \ > source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg -### 128卡 require_batches 消融实验 - -| training mode | Resource allocation | require_size | step | generate_sequences | old_log_prob | update_actor | total time | acc/best@32/mean | -|--------------------|---------------------|--------------|------|--------------------|--------------|--------------|------------|------------------| -| fully_async_policy | 64:64 | 1 | | | | | | | -| fully_async_policy | 64:64 | 2 | | | | | | | -| fully_async_policy | 64:64 | 4 | | | | | | | - -> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg - -### 128卡 stale 消融实验 - -| training mode | Resource allocation | staleness | step | generate_sequences | old_log_prob | update_actor | total time | acc/best@32/mean | -|--------------------|---------------------|-----------|------|--------------------|--------------|--------------|------------|------------------| -| fully_async_policy | 64:64 | 0 | | | | | | | -| fully_async_policy | 64:64 | 0.1 | | | | | | | -| fully_async_policy | 64:64 | 0.3 | | | | | | | -| fully_async_policy | 64:64 | 0.5 | | | | | | | - -> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg - ## 后续计划 * GRPO实验 diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py index 4a97158809e..0aae90517e8 100644 --- a/verl/trainer/ppo/ray_trainer.py +++ b/verl/trainer/ppo/ray_trainer.py @@ -602,11 +602,9 @@ def 
_validate(self): sample_scores.extend(scores) reward_extra_infos_dict["reward"].extend(scores) - print(f"len reward_extra_infos_dict['reward']: {len(reward_extra_infos_dict['reward'])}") if "reward_extra_info" in result: for key, lst in result["reward_extra_info"].items(): reward_extra_infos_dict[key].extend(lst) - print(f"len reward_extra_infos_dict['{key}']: {len(reward_extra_infos_dict[key])}") # collect num_turns of each prompt if "__num_turns__" in test_batch.non_tensor_batch: From ead757a95e70386dcfb01641ce9bbe6abd2da594 Mon Sep 17 00:00:00 2001 From: arron Date: Thu, 16 Oct 2025 15:08:56 +0800 Subject: [PATCH 171/182] fix ci --- .../agent_loop/agent_loop.py | 38 +++++----- .../config/fully_async_ppo_trainer.yaml | 4 ++ recipe/fully_async_policy/param_sync.py | 4 +- .../vllm_rollout/vllm_async_server.py | 51 +++++++------- verl/trainer/config/actor/dp_actor.yaml | 5 +- verl/workers/actor/dp_actor.py | 14 ++-- .../rollout/vllm_rollout/vllm_async_server.py | 70 +++++++++---------- 7 files changed, 97 insertions(+), 89 deletions(-) diff --git a/recipe/fully_async_policy/agent_loop/agent_loop.py b/recipe/fully_async_policy/agent_loop/agent_loop.py index 8dc7bbf609f..55489d705a5 100644 --- a/recipe/fully_async_policy/agent_loop/agent_loop.py +++ b/recipe/fully_async_policy/agent_loop/agent_loop.py @@ -70,13 +70,13 @@ class FullyAsyncAgentLoopOutput(AgentLoopOutput): @ray.remote class FullyAsyncAgentLoopWorker(AgentLoopWorkerBase): def __init__( - self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], rm_executor: BatchExecutor = None + self, config: DictConfig, server_handles: list[ray.actor.ActorHandle], rm_executor: BatchExecutor = None ): self.server_manager = FullyAsyncLLMServerManager(config, server_handles) super().__init__(config, server_handles, rm_executor) async def generate_sequences_no_post( - self, batch: DataProto, partial_output_list: Optional[list[AgentLoopOutput]] + self, batch: DataProto, partial_output_list: 
Optional[list[AgentLoopOutput]] ) -> list[AgentLoopOutput]: """Generate sequences from agent loop. @@ -126,19 +126,19 @@ async def generate_sequences_no_post( return await asyncio.gather(*tasks) async def _partial_run_agent_loop( - self, - sampling_params: dict[str, Any], - trajectory: dict[str, Any], - *, - agent_name: str, - **kwargs, + self, + sampling_params: dict[str, Any], + trajectory: dict[str, Any], + *, + agent_name: str, + **kwargs, ) -> AgentLoopOutput: with rollout_trace_attr( - step=trajectory["step"], - sample_index=trajectory["sample_index"], - rollout_n=trajectory["rollout_n"], - validate=trajectory["validate"], - name="agent_loop", + step=trajectory["step"], + sample_index=trajectory["sample_index"], + rollout_n=trajectory["rollout_n"], + validate=trajectory["validate"], + name="agent_loop", ): assert agent_name in _agent_loop_registry, ( f"Agent loop {agent_name} not registered, registered agent loops: {_agent_loop_registry.keys()}" @@ -215,8 +215,10 @@ async def _initialize_llm_servers_async(self): model_config = self.config.actor_rollout_ref.model self.rollout_replicas = [ self.rollout_replica_class( - replica_rank=replica_rank, config=rollout_config, - model_config=model_config, gpus_per_node=self.config.trainer.n_gpus_per_node + replica_rank=replica_rank, + config=rollout_config, + model_config=model_config, + gpus_per_node=self.config.trainer.n_gpus_per_node, ) for replica_rank in range(num_replicas) ] @@ -230,9 +232,9 @@ async def _initialize_llm_servers_async(self): self.server_addresses = [server._server_address for server in self.rollout_replicas] async def generate_single_sample_async( - self, - sample: DataProto, - partial_output_list: Optional[list[AgentLoopOutput]], + self, + sample: DataProto, + partial_output_list: Optional[list[AgentLoopOutput]], ) -> list[AgentLoopOutput]: """ Asynchronously process a single sample diff --git a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml 
b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml index 84a3cb7c290..17c3b925476 100644 --- a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml +++ b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml @@ -48,3 +48,7 @@ rollout: data: # Number of samples generated, currently only support 1 gen_batch_size: 1 + +actor: + # Whether to use rollout log probs for training + use_rollout_log_probs: ${oc.select:async_training.use_rollout_log_probs, True} diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py index d6c67ceb409..2fdcbb919db 100644 --- a/recipe/fully_async_policy/param_sync.py +++ b/recipe/fully_async_policy/param_sync.py @@ -18,6 +18,8 @@ import ray from ray.util.collective import collective +from verl.utils.device import get_nccl_backend + logger = logging.getLogger(__name__) @@ -69,7 +71,7 @@ def _init_sync_group(self): actor_rollout_workers, len(actor_rollout_workers), list(range(0, len(actor_rollout_workers))), - backend="nccl", + backend=get_nccl_backend(), group_name=self.sync_group_name, ) diff --git a/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py b/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py index 7a4dc8e7d7d..93381e1bff0 100644 --- a/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py +++ b/recipe/fully_async_policy/vllm_rollout/vllm_async_server.py @@ -16,13 +16,12 @@ from typing import Any, Optional, Sequence import ray -from omegaconf import DictConfig from ray.actor import ActorHandle from vllm import SamplingParams from vllm.inputs import TokensPrompt from vllm.outputs import RequestOutput -from verl.workers.config import RolloutConfig, RewardModelConfig, HFModelConfig +from verl.workers.config import HFModelConfig, RewardModelConfig, RolloutConfig from verl.workers.rollout.replica import RolloutMode from verl.workers.rollout.vllm_rollout.vllm_async_server import ( _qwen2_5_vl_dedup_image_tokens, @@ -37,15 +36,15 @@ 
@ray.remote(num_cpus=1) class vLLMHttpServerForPartial(vLLMHttpServerBase): def __init__( - self, - config: RolloutConfig | RewardModelConfig, - model_config: HFModelConfig, - rollout_mode: RolloutMode, - workers: list[ActorHandle], - replica_rank: int, - node_rank: int, - gpus_per_node: int, - nnodes: int, + self, + config: RolloutConfig | RewardModelConfig, + model_config: HFModelConfig, + rollout_mode: RolloutMode, + workers: list[ActorHandle], + replica_rank: int, + node_rank: int, + gpus_per_node: int, + nnodes: int, ): super().__init__(config, model_config, rollout_mode, workers, replica_rank, node_rank, gpus_per_node, nnodes) @@ -56,11 +55,11 @@ def __init__( self.req_output: dict[str, Optional[RequestOutput]] = {} async def _generate_step( - self, - prompt_ids: list[int], - sampling_params: dict[str, Any], - request_id: str, - image_data: Optional[list[Any]] = None, + self, + prompt_ids: list[int], + sampling_params: dict[str, Any], + request_id: str, + image_data: Optional[list[Any]] = None, ): max_tokens = self.config.max_model_len - len(prompt_ids) sampling_params["logprobs"] = 1 @@ -79,11 +78,11 @@ async def _generate_step( assert self.req_output[request_id] is not None async def generate_for_partial( - self, - prompt_ids: list[int], - sampling_params: dict[str, Any], - request_id: str, - image_data: Optional[list[Any]] = None, + self, + prompt_ids: list[int], + sampling_params: dict[str, Any], + request_id: str, + image_data: Optional[list[Any]] = None, ) -> tuple[list[Any], list[Any], bool] | tuple[Sequence[int], list[float], Any]: async with self.lock: if self.paused: @@ -133,11 +132,11 @@ async def reset_prefix_cache(self): class FullyAsyncvLLMReplica(vLLMReplica): def __init__( - self, - replica_rank: int, - config: RolloutConfig | RewardModelConfig, - model_config: HFModelConfig, - gpus_per_node: int = 8, + self, + replica_rank: int, + config: RolloutConfig | RewardModelConfig, + model_config: HFModelConfig, + gpus_per_node: int = 8, ): 
super().__init__(replica_rank, config, model_config, gpus_per_node) self.server_class = vLLMHttpServerForPartial diff --git a/verl/trainer/config/actor/dp_actor.yaml b/verl/trainer/config/actor/dp_actor.yaml index 9969f7635b9..a2ff54d4854 100644 --- a/verl/trainer/config/actor/dp_actor.yaml +++ b/verl/trainer/config/actor/dp_actor.yaml @@ -39,7 +39,4 @@ entropy_from_logits_with_chunking: False entropy_checkpointing: False # Whether to remove padding tokens in inputs during training -use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} - -# Whether it's a hybrid engine -hybrid_engine: ${oc.select:actor_rollout_ref.hybrid_engine, True} \ No newline at end of file +use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} \ No newline at end of file diff --git a/verl/workers/actor/dp_actor.py b/verl/workers/actor/dp_actor.py index fe78e360365..7dd531ad266 100644 --- a/verl/workers/actor/dp_actor.py +++ b/verl/workers/actor/dp_actor.py @@ -78,7 +78,7 @@ def __init__(self, config: ActorConfig, actor_module: nn.Module, actor_optimizer self.compute_entropy_from_logits = ( torch.compile(entropy_from_logits, dynamic=True) - if self.config.get("use_torch_compile", True) # use torch compile by default + if self.config.get("use_torch_compile", True) # use torch compile by default else entropy_from_logits ) self.device_name = get_device_name() @@ -387,7 +387,7 @@ def update_policy(self, data: DataProto): # See PPO paper for details. 
https://arxiv.org/abs/1707.06347 mini_batches = data.split(self.config.ppo_mini_batch_size) - on_policy = len(mini_batches) == 1 and self.config.ppo_epochs == 1 and self.config.hybrid_engine + on_policy = len(mini_batches) == 1 and self.config.ppo_epochs == 1 metrics = {} for _ in range(self.config.ppo_epochs): @@ -427,10 +427,14 @@ def update_policy(self, data: DataProto): model_inputs, temperature=temperature, calculate_entropy=calculate_entropy ) - if on_policy: - old_log_prob = log_prob.detach() - else: + # for fully_async_policy recipe + if hasattr(self.config, "use_rollout_log_probs") and self.config.use_rollout_log_probs: old_log_prob = model_inputs["old_log_probs"] + else: + if on_policy: + old_log_prob = log_prob.detach() + else: + old_log_prob = model_inputs["old_log_probs"] loss_mode = self.config.policy_loss.get("loss_mode", "vanilla") # vanilla -> verl.trainer.ppo.core_algos.compute_policy_loss_vanilla diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py index b82d4013cc0..42028ba7a4f 100644 --- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py +++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py @@ -61,7 +61,7 @@ def _init_executor(self) -> None: tp_size = self.vllm_config.parallel_config.tensor_parallel_size addresses = os.environ["VERL_VLLM_ZMQ_ADDRESSES"].split(",") - addresses = addresses[dp_rank_local * tp_size: (dp_rank_local + 1) * tp_size] + addresses = addresses[dp_rank_local * tp_size : (dp_rank_local + 1) * tp_size] self.context = zmq.Context() self.sockets = [] for address in addresses: @@ -81,11 +81,11 @@ def _init_executor(self) -> None: self.collective_rpc("load_model") def collective_rpc( - self, - method: str | Callable, - timeout: Optional[float] = None, - args: tuple = (), - kwargs: Optional[dict[str, Any]] = None, + self, + method: str | Callable, + timeout: Optional[float] = None, + args: tuple = (), + kwargs: Optional[dict[str, Any]] = None, ) 
-> list[Any]: if isinstance(method, str): sent_method = method @@ -114,15 +114,15 @@ class vLLMHttpServerBase: """ def __init__( - self, - config: RolloutConfig | RewardModelConfig, - model_config: HFModelConfig, - rollout_mode: RolloutMode, - workers: list[ActorHandle], - replica_rank: int, - node_rank: int, - gpus_per_node: int, - nnodes: int, + self, + config: RolloutConfig | RewardModelConfig, + model_config: HFModelConfig, + rollout_mode: RolloutMode, + workers: list[ActorHandle], + replica_rank: int, + node_rank: int, + gpus_per_node: int, + nnodes: int, ): """ Args: @@ -337,11 +337,11 @@ async def run_headless(self, args: argparse.Namespace): ) async def generate( - self, - prompt_ids: list[int], - sampling_params: dict[str, Any], - request_id: str, - image_data: Optional[list[Any]] = None, + self, + prompt_ids: list[int], + sampling_params: dict[str, Any], + request_id: str, + image_data: Optional[list[Any]] = None, ) -> TokenOutput: """Generate sequence with token-in-token-out.""" # TODO(@wuxibin): switch to `/generate` http endpoint once multi-modal support ready. 
@@ -403,15 +403,15 @@ class vLLMHttpServer(vLLMHttpServerBase): """ def __init__( - self, - config: RolloutConfig | RewardModelConfig, - model_config: HFModelConfig, - rollout_mode: RolloutMode, - workers: list[ActorHandle], - replica_rank: int, - node_rank: int, - gpus_per_node: int, - nnodes: int, + self, + config: RolloutConfig | RewardModelConfig, + model_config: HFModelConfig, + rollout_mode: RolloutMode, + workers: list[ActorHandle], + replica_rank: int, + node_rank: int, + gpus_per_node: int, + nnodes: int, ): super().__init__(config, model_config, rollout_mode, workers, replica_rank, node_rank, gpus_per_node, nnodes) @@ -421,11 +421,11 @@ def __init__( class vLLMReplica(RolloutReplica): def __init__( - self, - replica_rank: int, - config: RolloutConfig | RewardModelConfig, - model_config: HFModelConfig, - gpus_per_node: int = 8, + self, + replica_rank: int, + config: RolloutConfig | RewardModelConfig, + model_config: HFModelConfig, + gpus_per_node: int = 8, ): super().__init__(replica_rank, config, model_config, gpus_per_node) self.server_class = vLLMHttpServer @@ -462,7 +462,7 @@ async def launch_servers(self): # create server actor in each node with node affinity for node_rank in range(nnodes): - workers = self.workers[node_rank * gpus_per_node: (node_rank + 1) * gpus_per_node] + workers = self.workers[node_rank * gpus_per_node : (node_rank + 1) * gpus_per_node] node_id = worker_node_ids[node_rank * gpus_per_node] server = self.server_class.options( scheduling_strategy=ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy( From 2383a157410c6004fa7963f3a3c6c4e576e7cd42 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Thu, 16 Oct 2025 15:56:40 +0800 Subject: [PATCH 172/182] fix some ci --- docs/advance/fully_async.md | 5 +++++ recipe/fully_async_policy/README_zh.md | 2 +- .../config/fully_async_ppo_trainer.yaml | 17 +++++++++-------- recipe/fully_async_policy/param_sync.py | 4 +--- tests/special_e2e/run_fully_async_policy.sh | 2 +- 
tests/special_sanity/check_device_api_usage.py | 1 + verl/workers/actor/dp_actor.py | 1 + verl/workers/config/actor.py | 2 +- 8 files changed, 20 insertions(+), 14 deletions(-) diff --git a/docs/advance/fully_async.md b/docs/advance/fully_async.md index e69de29bb2d..aa9e33ff99c 100644 --- a/docs/advance/fully_async.md +++ b/docs/advance/fully_async.md @@ -0,0 +1,5 @@ +# Recipe: Fully Async Policy Async Trainer + +**Author:** `https://github.com/meituan-search` + +Last updated: 10/16/2025. \ No newline at end of file diff --git a/recipe/fully_async_policy/README_zh.md b/recipe/fully_async_policy/README_zh.md index 040dfe47dd1..0a43a7ec406 100644 --- a/recipe/fully_async_policy/README_zh.md +++ b/recipe/fully_async_policy/README_zh.md @@ -2,7 +2,7 @@ **Author:** `https://github.com/meituan-search` -Last updated: 10/13/2025. +Last updated: 10/16/2025. 本文档介绍了完全异步PPO训练系统,该系统实现了 Trainer 和 Rollouter 的完全解耦,支持异步样本生成和训练。 diff --git a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml index 17c3b925476..4a8b8fc32e7 100644 --- a/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml +++ b/recipe/fully_async_policy/config/fully_async_ppo_trainer.yaml @@ -10,19 +10,19 @@ async_training: # Maximum samples staleness threshold staleness_threshold: 0.1 - + # Frequency of parameter synchronization between rollouter and trainer, # One step means trainer obtains a batch of required samples trigger_parameter_sync_step: 4 # The number of ppo_mini_batches that the FullyAsyncTrainer obtains once - require_batches: 1 + require_batches: 1 # When synchronizing parameters, whether to interrupt rollouter and perform partial rollout partial_rollout: True # Whether to use rollout log probs for training - use_rollout_log_probs: True + use_rollout_log_probs: True # Rollout config rollout: @@ -34,7 +34,7 @@ rollout: n_gpus_per_node: 8 # number of responses (i.e. num sample times). 
> 1 for grpo - n: 4 + n: 4 # total rollout samples # TODO rename to total_rollout_samples total_rollout_steps: 100 @@ -43,12 +43,13 @@ rollout: total_epochs: 10 # Test frequency, how many times a parameter update triggers a validation - test_freq: 1 + test_freq: 1 data: # Number of samples generated, currently only support 1 gen_batch_size: 1 -actor: - # Whether to use rollout log probs for training - use_rollout_log_probs: ${oc.select:async_training.use_rollout_log_probs, True} +actor_rollout_ref: + actor: + # Whether to use rollout log probs for training + use_rollout_log_probs: ${oc.select:async_training.use_rollout_log_probs, True} diff --git a/recipe/fully_async_policy/param_sync.py b/recipe/fully_async_policy/param_sync.py index 2fdcbb919db..d6c67ceb409 100644 --- a/recipe/fully_async_policy/param_sync.py +++ b/recipe/fully_async_policy/param_sync.py @@ -18,8 +18,6 @@ import ray from ray.util.collective import collective -from verl.utils.device import get_nccl_backend - logger = logging.getLogger(__name__) @@ -71,7 +69,7 @@ def _init_sync_group(self): actor_rollout_workers, len(actor_rollout_workers), list(range(0, len(actor_rollout_workers))), - backend=get_nccl_backend(), + backend="nccl", group_name=self.sync_group_name, ) diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh index a2f99f0d67b..e5386c9e4fe 100644 --- a/tests/special_e2e/run_fully_async_policy.sh +++ b/tests/special_e2e/run_fully_async_policy.sh @@ -143,7 +143,7 @@ if [ "${ACTOR_STRATEGY}" == "fsdp2" ]; then ref_offload=True actor_offload=False - python3 -m recipe.fully_async_policy.fully_async_main \ + /home/hadoop-ai-search/miniconda3/bin/python -m recipe.fully_async_policy.fully_async_main \ "${common_params[@]}" \ actor_rollout_ref.actor.strategy=fsdp2 \ critic.strategy=fsdp2 \ diff --git a/tests/special_sanity/check_device_api_usage.py b/tests/special_sanity/check_device_api_usage.py index dae5ac4b43d..8d3cfda27c8 100644 --- 
a/tests/special_sanity/check_device_api_usage.py +++ b/tests/special_sanity/check_device_api_usage.py @@ -48,6 +48,7 @@ NCCL_KEYWORD_CHECK_WHITELIST = [ "verl/utils/device.py", "verl/third_party/sglang/parallel_state.py", # appear in default backend + "verl/recipe/fully_async_policy/param_sync.py", # fully_async_policy in default backend ] SEARCH_WHITELIST = CUDA_KEYWORD_CHECK_WHITELIST + NCCL_KEYWORD_CHECK_WHITELIST diff --git a/verl/workers/actor/dp_actor.py b/verl/workers/actor/dp_actor.py index 7dd531ad266..5955dfc33ed 100644 --- a/verl/workers/actor/dp_actor.py +++ b/verl/workers/actor/dp_actor.py @@ -429,6 +429,7 @@ def update_policy(self, data: DataProto): # for fully_async_policy recipe if hasattr(self.config, "use_rollout_log_probs") and self.config.use_rollout_log_probs: + print("for fully_async_policy recipe") old_log_prob = model_inputs["old_log_probs"] else: if on_policy: diff --git a/verl/workers/config/actor.py b/verl/workers/config/actor.py index 1ccab8e41c7..fe5b3e1193a 100644 --- a/verl/workers/config/actor.py +++ b/verl/workers/config/actor.py @@ -231,7 +231,7 @@ class FSDPActorConfig(ActorConfig): fsdp_config: FSDPEngineConfig = field(default_factory=FSDPEngineConfig) use_remove_padding: bool = False profiler: ProfilerConfig = field(default_factory=ProfilerConfig) - hybrid_engine: bool = True + use_rollout_log_probs: bool = False def __post_init__(self): """Validate FSDP actor configuration parameters.""" From 7298b65d6381442c470b128faa1e9ffac7d22c85 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Thu, 16 Oct 2025 16:01:16 +0800 Subject: [PATCH 173/182] fix e2e_fully_async_policy_fsdp2 --- tests/special_e2e/run_fully_async_policy.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh index e5386c9e4fe..a2f99f0d67b 100644 --- a/tests/special_e2e/run_fully_async_policy.sh +++ b/tests/special_e2e/run_fully_async_policy.sh @@ -143,7 +143,7 @@ if [ 
"${ACTOR_STRATEGY}" == "fsdp2" ]; then ref_offload=True actor_offload=False - /home/hadoop-ai-search/miniconda3/bin/python -m recipe.fully_async_policy.fully_async_main \ + python3 -m recipe.fully_async_policy.fully_async_main \ "${common_params[@]}" \ actor_rollout_ref.actor.strategy=fsdp2 \ critic.strategy=fsdp2 \ From 0730b75275b7d3a8e2d8a40a3dbfdea6290b0d8b Mon Sep 17 00:00:00 2001 From: wangshulin02 <953550366@qq.com> Date: Thu, 16 Oct 2025 16:15:22 +0800 Subject: [PATCH 174/182] update readme exp --- recipe/fully_async_policy/README_zh.md | 48 +++++++++++++------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/recipe/fully_async_policy/README_zh.md b/recipe/fully_async_policy/README_zh.md index 040dfe47dd1..b3c4ffe83d0 100644 --- a/recipe/fully_async_policy/README_zh.md +++ b/recipe/fully_async_policy/README_zh.md @@ -275,14 +275,14 @@ python -m recipe.fully_async_policy.fully_async_main \ * staleness_threshold: 0.3 * partial_rollout: True -| training mode | Resource allocation | step | generate_sequences | old_log_prob | update_actor | total time | acc/best@32/mean | -|--------------------|---------------------|------|--------------------|--------------|--------------|------------|------------------| -| colocate sync | 32 | | | | | | | -| fully_async_policy | 16:16 | | | | | | | -| colocate sync | 64 | | | | | | | -| fully_async_policy | 32:32 | | | | | | | -| colocate sync | 128 | | | | | | | -| fully_async_policy | 64:64 | | | | | | | +| training mode | resource allocation | step | gen | old_log_prob | update_actor | total time
100 step | total time
200 step | total time
300 step | total time
400 step | acc/mean@1 | +|:------------------: |:-------------------: |:------: |:------: |:------------: |:------------: |:----------------------: |:----------------------: |:----------------------: |:----------------------: |:----------------------------: | +| colocate sync | 32 | 790.10 | 357.41 | 107.71 | 313.81 | 13h 44m | 1d 3h 43m | 2d 9h 22m | 3d 17h 5m | max: 0.3313
last: 0.2448 | +| fully_async_policy | 16:16 | | | \ | | | | | | max:
last: | +| colocate sync | 64 | 365.28 | 150.72 | 70.26 | 133.41 | 10h 22m | 20h 45m | 1d 7h 6m | 1d 17h 32m | max: 0.3365
last: 0.2333 | +| fully_async_policy | 32:32 | 189.26 | 28.46 | \ | 156.98 | 4h 57m
(2.09x) | 10h 14m
(2.03x) | 16h 58m
(1.83x) | 21h 40m
(1.92x) | max: 0.3677
last: 0.3406 | +| colocate sync | 128 | 356.30 | 177.85 | 53.92 | 113.81 | 8h 36m | 17h 56m | 1d 5h 6m | 1d 16h 48m | max: 0.3573
last: 0.2958 | +| fully_async_policy | 64:64 | 150.63 | 33.14 | \ | 113.16 | 3h 13m
(2.67x) | 6h 46m
(2.65x) | 10h 53m
(2.67x) | 17h 22m
(2.35x) | max: 0.3521
last: 0.3094 | > source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg @@ -290,22 +290,22 @@ python -m recipe.fully_async_policy.fully_async_main \ 我们使用 Qwen2.5-Math-7B 验证 fully async 所支持的各个模型的效果。 -| training mode | Resource allocation | mode | step | generate_sequences | old_log_prob | update_actor | total time | acc/best@32/mean | -|--------------------|---------------------|------------------------------------------------|------|--------------------|--------------|--------------|------------|------------------| -| fully_async_policy | 64:64 | `stream off policy pipeline` | | | | | | | -| fully_async_policy | 64:64 | `async stream pipeline with staleness samples` | | | | | | | -| fully_async_policy | 64:64 | `async stream pipeline with partial rollout` | | | | | | | +| mode | step | gen | old_log_prob | update_actor | total time
100 step | total time
200 step | total time
300 step | total time
400 step | acc/mean@1 | +|:-------------------------------------------------------------------------------------: |:------: |:------: |:------------: |:------------: |:----------------------: |:----------------------: |:----------------------: |:----------------------: |:---------------------------: | +| `stream off policy pipeline`
(trigger_parameter_sync_step= 4,
require_batches= 4) | 231.34 | 128.47 | \ | 98.77 | 4h 25m | 9h 41m | 15h 2m | 1d 1h 53m | max: 0.2844
last: 0.2604 | +| `async stream pipeline with staleness samples`
(+staleness_threshold=0.5) | | | | | | | | | | +| `async stream pipeline with partial rollout`
(+partial_rollout=True) | 150.63 | 33.14 | \ | 113.16 | 3h 13m | 6h 46m | 10h 53m | 17h 22m | max: 0.3521
last: 0.3094 | ### 128卡 stale 消融实验 在 `async stream pipeline with partial rollout` 模式下,我们验证 staleness 的设置对于训练效率的影响。 -| training mode | Resource allocation | staleness | step | generate_sequences | old_log_prob | update_actor | total time | acc/best@32/mean | -|--------------------|---------------------|-----------|------|--------------------|--------------|--------------|------------|------------------| -| fully_async_policy | 64:64 | 0 | | | | | | | -| fully_async_policy | 64:64 | 0.1 | | | | | | | -| fully_async_policy | 64:64 | 0.3 | | | | | | | -| fully_async_policy | 64:64 | 0.5 | | | | | | | +| staleness_threshold | step | gen | old_log_prob | update_actor | total time
100 step | total time
200 step | total time
300 step | total time
400 step | acc/mean@1 | +|:-------------------: |:------: |:------: |:------------: |:------------: |:----------------------: |:----------------------: |:----------------------: |:----------------------: |:---------------------------: | +| 0 | 231.34 | 128.47 | \ | 98.77 | 4h 25m | 9h 41m | 15h 2m | 1d 1h 53m | max: 0.2844
last: 0.2604 | +| 0.1 | 171.30 | 58.17 | \ | 109.12 | 3h 53m | 8h 37m | 14h 25m | 19h 59m | max: 0.3542
last: 0.2979 | +| 0.3 | 146.11 | 38.88 | \ | 103.22 | 3h 18m | 6h 49m | 11h 40m | 17h 20m | max: 0.3469
last: 0.2865 | +| 0.5 | 150.63 | 33.14 | \ | 113.16 | 3h 13m | 6h 46m | 10h 53m | 17h 22m | max: 0.3521
last: 0.3094 | > source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg @@ -313,11 +313,11 @@ python -m recipe.fully_async_policy.fully_async_main \ 在多次测试下,我们发现流式每次下发样本的数量,会影响训练的结果,我们通过修改 `async_training.require_batches` 验证对与结果的影响。 -| training mode | Resource allocation | async_training.require_batches | step | generate_sequences | old_log_prob | update_actor | total time | acc/best@32/mean | -|--------------------|---------------------|--------------------------------|------|--------------------|--------------|--------------|------------|------------------| -| fully_async_policy | 64:64 | 1 | | | | | | | -| fully_async_policy | 64:64 | 2 | | | | | | | -| fully_async_policy | 64:64 | 4 | | | | | | | +| require_batches | step | gen | old_log_prob | update_actor | total time
100 step | total time
200 step | total time
300 step | acc/mean@1 | +|:---------------: |:------: |:-----: |:------------: |:------------: |:----------------------: |:----------------------: |:----------------------: |:---------------------------: | +| 1 | 203.47 | 30.88 | \ | 181.08 | 3h 31m | 8h 29m | 17h 36m | max: 0.349
last: 0.326 | +| 2 | 158.72 | 26.32 | \ | 128.08 | 3h 35m | 7h 38m | 13h 57m | max: 0.351
last: 0.3406 | +| 4 | 124.64 | 25.62 | \ | 95.06 | 3h 13m | 6h 46m | 10h 53m | max: 0.3521
last: 0.3521 | > source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg From c4a063374dd2317597a8fd18032bf0f4ae034131 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Thu, 16 Oct 2025 18:33:57 +0800 Subject: [PATCH 175/182] update readme --- docs/advance/fully_async.md | 426 +++++++++++++++++++++++- recipe/fully_async_policy/README.md | 427 +++++++++++++++++++++++++ recipe/fully_async_policy/README_zh.md | 66 ++-- verl/workers/actor/dp_actor.py | 1 - 4 files changed, 887 insertions(+), 33 deletions(-) create mode 100644 recipe/fully_async_policy/README.md diff --git a/docs/advance/fully_async.md b/docs/advance/fully_async.md index aa9e33ff99c..77498131e45 100644 --- a/docs/advance/fully_async.md +++ b/docs/advance/fully_async.md @@ -1,5 +1,427 @@ # Recipe: Fully Async Policy Async Trainer -**Author:** `https://github.com/meituan-search` +**Author:** `https://github.com/meituan-search` -Last updated: 10/16/2025. \ No newline at end of file +Last updated: 10/16/2025. + +This document introduces a fully asynchronous PPO training system that completely decouples the Trainer and Rollouter, +supporting asynchronous sample generation and training. +Under this system, we achieved a 2.35x-2.67x performance improvement when training the Qwen2.5-7B model with 128 GPUs, +without significantly affecting the results. + +## Introduction + +### Background + +The separated rollout and train architecture, compared to the colocate architecture, can allocate resources more +flexibly and design more flexible training logic, thereby addressing issues such as low GPU utilization and training +efficiency caused by long-tail problems. +The one_step_off_policy alleviates the problem of long rollout times and achieves some gains in training efficiency by +designing a separated architecture and performing asynchronous training between rollout and train for one round. 
+However, it forcibly uses data from one round of asynchronous training, which is not flexible enough and cannot +completely eliminate the impact of long-tail on training efficiency. +In other frameworks such as AReaL, Magistral, StreamRL, and AsyncFlow, asynchronous training and streaming training have +been implemented based on the separated architecture and have achieved gains. +We drew on their methods and implemented them in VERL. The fully_async_policy supports asynchronous, streaming, and partial +rollout training. +By reasonably setting parameters such as resource allocation and parameter synchronization frequency, fully_async_policy +can significantly improve training efficiency. + +> Magistral https://arxiv.org/abs/2506.10910 +> +> AReaL: A Large-Scale Asynchronous Reinforcement Learning System for Language +> Reasoning https://arxiv.org/abs/2505.24298 +> +> StreamRL: Scalable, Heterogeneous, and Elastic RL for LLMs with Disaggregated Stream +> Generation https://arxiv.org/abs/2504.15930 +> +> AsyncFlow: An Asynchronous Streaming RL Framework for Efficient LLM Post-Training https://arxiv.org/abs/2507.01663 +> + +### Core Contributions + +* **Resource Isolation**: Unlike using hybrid_engine, Rollouter and Trainer use separate computing resources and need to + specify the resources they occupy separately. +* **Parallel Generation and Training**: While the Trainer is training, the Rollouter is generating new samples. +* **Multi-step Asynchronous**: Compared to one step off policy, it supports asynchronous settings from 0.x steps to + multiple steps, making the asynchronous solution more flexible. +* **NCCL Parameter Synchronization**: Uses NCCL communication primitives for parameter communication between Rollouter + and Trainer. +* **Stream Inference and Training**: Rollouter generates data sample by sample, and data transmission uses a single + sample as the minimum transmission unit. 
+* **Asynchronous Training and Freshness Control**: By setting the parameter async_training.staleness_threshold, it + supports training with samples generated by old parameters. +* **PartialRollout**: The Rollouter's inference process supports partial rollout logic. During parameter + synchronization, by adding `sleep()` and `resume()` logic, it + saves samples from ongoing rollouts and continues using them in the next rollout, reducing the time spent waiting for + ongoing tasks to finish during parameter synchronization. + +Currently, the supported usage mode is fsdp+vllm. vllm must use the server mode based on AgentLoop. + +## Design + +The overall architecture of fully_async_policy is shown in the figure below. fully_async_policy mainly consists of four +parts: Rollouter, MessageQueue, Trainer, and ParameterSynchronizer. + +![fully_async_policy_structure]( +https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/fully_async_policy_structure.svg?raw=true) + +1. Rollouter generates sequences sample by sample and puts the generated samples into the MessageQueue, with the + production speed controlled by freshness. +2. MessageQueue is used to temporarily store samples generated by Rollouter. +3. Trainer fetches samples from MessageQueue sample by sample. After fetching `require_batches*ppo_mini_batch_size` + samples, it will perform training. After training for async_training.trigger_parameter_sync_step rounds, it triggers + a parameter synchronization with Rollouter. +4. ParameterSynchronizer implements NCCL-based synchronous parameter synchronization between Trainer and Rollouter. + +The source of benefits compared to the base scheme lies in the fact that in the colocate case, using more resources for +rollout cannot solve the idleness caused by long-tail samples. 
+After we perform resource isolation, the time for rollout and train may be longer than before (because fewer resources +are used), +but the overlap in their time consumption reduces the end-to-end time consumption. + +![fully_async_policy_revenue]( +https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/fully_async_policy_revenue.svg?raw=true) + +## Usage + +### Parameter Description + +| hyperparameter | description | +|-----------------------------------------------|------------------------------------------------------------------------------------------------| +| `trainer.nnodes` | Number of nodes for Trainer | +| `trainer.n_gpus_per_node` | Number of GPUs per node for Trainer | +| `rollout.nnodes` | Number of nodes for Rollouter | +| `rollout.n_gpus_per_node` | Number of GPUs per node for Rollouter | +| `data.train_batch_size` | In the fully async strategy, this value is not effective (default is 0) | +| `data.gen_batch_size` | In the fully async strategy, uses streaming sample production logic (default is 1) | +| `rollout.total_rollout_steps` | Total number of rollout samples | +| `rollout.test_freq` | How many times Rollouter updates parameters before performing a validation | +| `actor_rollout_ref.actor.ppo_mini_batch_size` | The ppo_mini_batch_size is a global num across all workers/gpus | +| `async_training.require_batches` | Number of ppo_mini_batch_size that FullyAsyncTrainer fetches at once | +| `async_training.trigger_parameter_sync_step` | Indicates how many local updates FullyAsyncTrainer performs before a parameter synchronization | +| `async_training.staleness_threshold` | Freshness control | +| `async_training.partial_rollout` | Whether to perform partial_rollout | +| `async_training.use_rollout_log_probs` | Use log_probs generated by rollout | + +**Further Explanation:** + +* `rollout.total_rollout_steps` + + Compared to colocate, the quantity can be aligned by multiplying train_batch_size and step: + 
`rollout.total_rollout_steps = data.train_batch_size * step`. + +* `async_training.trigger_parameter_sync_step` + + In the fully async strategy, it indicates how many local updates the Trainer performs (i.e., how many times it fetches + `require_batches * ppo_mini_batch_size` samples) before a parameter synchronization with Rollouter. + Between every two parameter synchronizations between Rollouter and Trainer, the Trainer will process + `trigger_parameter_sync_step* require_batches*ppo_mini_batch_size` samples. + To fairly compare speed with colocate, trigger_parameter_sync_step should be set to + `data.train_batch_size / (require_batches * ppo_mini_batch_size)`. + +* `async_training.staleness_threshold` + + In the fully async strategy, it indicates the maximum proportion of stale samples allowed to be used. + + * staleness_threshold=0, indicates synchronous training. + Rollouter will generate a fixed number of samples between two parameter updates, the sample count is: + $$rollout\_num = (trigger\_parameter\_sync\_step*require\_batches*ppo\_mini\_batch\_size)$$ + * staleness_threshold>0, indicates asynchronous training, can be set to a decimal for more flexible asynchronous + calls. + Rollouter will generate at most the following number of samples between two parameter updates: + $$rollout\_num = (1+staleness\_threshold)*(trigger\_parameter\_sync\_step*require\_batches*ppo\_mini\_batch\_size) - num\_staleness\_sample $$ + + num_staleness_sample represents the number of stale samples generated in excess during the last rollout. + + Since it's a streaming system, rollout continues to generate and trainer continues to consume. If rollouter is slower, + trainer will trigger parameter synchronization earlier, and rollouter will not actually produce rollout_num samples. + When rollout is fast enough, setting staleness_threshold to 1 is basically equivalent to one_step_off policy. 
+ To avoid too many expired samples affecting training accuracy, it is recommended to set this value to less than 1. + +* `async_training.partial_rollout` + + partial_rollout only actually takes effect when staleness_threshold>0. + +* `async_training.use_rollout_log_probs` + + In reinforcement learning algorithms, log_probs have implicit correlations with parameter versions and tokens. Due to + the settings of algorithms like PPO/GRPO/DAPO, when calculating importance sampling, + old_log_prob must use the log_probs corresponding to the rollout parameters and tokens to ensure algorithm + correctness. In the fully + async strategy, we default to old_log_prob being calculated by rollout rather than by trainer. + + * `async_training.require_batches` + + In streaming training, require_batches should be set to 1, indicating that training is performed after producing + enough ppo_mini_batch_size samples. + In actual testing, we found that if fewer samples are issued at once, due to the order of data distribution, it can + cause training instability and longer response lengths. + Here, we additionally provide require_batches for streaming distribution and control the number of samples + participating in training at once. + +### Supported Modes + +1. on policy pipeline: + 1. **trigger_parameter_sync_step=1, staleness_threshold=0** + 2. Rollouter produces `require_batches*ppo_mini_batch_size` samples at once, Trainer fetches these samples for + training, and after training completes, Trainer and Rollouter perform a parameter synchronization; + 3. During the rollout phase, if there are long-tail samples but few rollout samples, shorter samples cannot fill + idle resources, causing some resource waste. + 4. As shown in figure a; + +2. stream off policy pipeline: + 1. **trigger_parameter_sync_step>1, staleness_threshold=0** + 2. Synchronous streaming training will be performed. 
Rollouter produces + `require_batches*ppo_mini_batch_size*trigger_parameter_sync_step` samples at once, Trainer performs a local + training every time it fetches `require_batches*ppo_mini_batch_size` samples, and after training + trigger_parameter_sync_step times, Trainer and Rollouter perform a parameter synchronization; + 3. Compared to a, since more samples are generated at once, resource idleness will be lower. + 4. In one step training, there will be two periods of resource idleness: when fetching the first batch of samples, + train waits for `require_batches*ppo_mini_batch_size` samples to be produced, and during the last parameter + update, rollout waits for training to complete. + 5. As shown in figure b; + +3. async stream pipeline with stale samples: + 1. **trigger_parameter_sync_step>=1, staleness_threshold>0, partial_rollout=False** + 2. After each parameter update, Rollouter will plan to produce at most rollout_num samples (in practice, the number + of samples generated may be less than this value depending on rollout speed). + 3. If the rollout process is relatively fast, Rollouter will generate some additional samples num_stale_samples + before parameter synchronization for immediate use by Trainer after synchronization. + When triggering parameter synchronization, if Rollouter has ongoing tasks, it will wait for the tasks to complete + and not add new tasks; + 4. Compared to b, except for the first step training, subsequent training will not have the time to wait for the + first batch rollout to finish, but will have the time to wait for active tasks to finish. + 5. As shown in figure c; + +4. async stream pipeline with partial rollout: + 1. **trigger_parameter_sync_step>=1, staleness_threshold>0, partial_rollout=True** + 2. Compared to c, when triggering parameter synchronization, if Rollouter has samples being produced, it will + interrupt the rollout process and perform parameter synchronization. 
The interrupted samples will continue to be + generated after synchronization. This reduces the time to wait for active tasks to finish. + 3. As shown in figure d; + +![fully_async_policy_mode]( +https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/fully_async_policy_mode.svg?raw=true) + +### Key Metrics + +| metrics | implication | +|------------------------------------------------|--------------------------------------------------------------------------------------------------------| +| `trainer/idle_ratio` | Trainer idle rate | +| `rollouter/idle_ratio` | Rollouter idle rate | +| `fully_async/count/stale_samples_processed` | Total number of old samples used in training | +| `fully_async/count/stale_trajectory_processed` | Total number of old trajectories used in training (one sample produces rollout.n trajectories) | +| `fully_async/partial/total_partial_num` | Number of partial samples processed by Trainer between two trigger_parameter_sync_step | +| `fully_async/partial/partial_ratio` | Ratio of partial samples processed by Trainer between two trigger_parameter_sync_step | +| `fully_async/partial/max_partial_span` | Maximum parameter span of partial samples processed by Trainer between two trigger_parameter_sync_step | + +### Parameter Tuning Recommendations + +* Resource Allocation and Adjustment: + * Reasonable resource allocation is the prerequisite for achieving good training efficiency. The ideal resource + allocation should make the rollout time and train time close, thereby minimizing pipeline bubbles in the entire + training process, + avoiding resource idleness, and ensuring Trainer does not use old samples. In real training scenarios, resource + allocation can be adjusted based on the idle time of rollout and train during actual training, + which can be obtained from rollouter/idle_ratio and trainer/idle_ratio. 
If rollouter/idle_ratio is high and + trainer/idle_ratio is low, + Trainer resources should be increased and Rollouter resources should be reduced, and vice versa. + +* Key Parameters: + * staleness_threshold: Setting it too high will cause more old samples to be used, affecting model performance. It + is recommended to set it to less than 1. + * require_batches: The closer to 1, the closer to a pure streaming process, the smaller the training bubbles, and + the faster the acceleration effect that can be achieved in terms of speed, but it will affect the order of sample + processing; + * trigger_parameter_sync_step: The smaller the setting, the closer to on policy, but it will cause frequent + parameter synchronization. Long-tail samples waste resources that cannot be filled by short samples, resulting in + low resource utilization. + The larger the setting, the higher the computational efficiency, but the accuracy will be affected by off policy. + * rollout.test_freq: It will occupy Rollouter resources and is not recommended to be set too small. + +* Mode Selection: By adjusting different parameters, the Fully Async architecture supports optimization acceleration at + different levels, suitable for tasks in different scenarios. + * For small-scale tasks that need to ensure training stability and on-policy nature, and have low speed + requirements, the on policy pipeline mode (Mode 1) can be tried. + * For scenarios that need to improve training throughput but are sensitive to staleness, the stream off policy + pipeline mode can be tried. That is, by + setting trigger_parameter_sync_step>1 to improve training efficiency, but still maintaining the synchronization + mechanism (staleness_threshold=0) (Mode 2). 
+ * For large-scale tasks that have high training speed requirements and can tolerate a certain degree of off-policy and + staleness, setting staleness_threshold>0 + and partial_rollout=True can improve training efficiency, using the async stream pipeline mode (Mode 3 or 4). + +### Quick Start + +```shell +rollout_mode="async" +rollout_name="vllm" # sglang or vllm +if [ "$rollout_mode" = "async" ]; then + export VLLM_USE_V1=1 + return_raw_chat="True" +fi + +train_prompt_bsz=0 +gen_prompt_bsz=1 +n_resp_per_prompt=16 +train_prompt_mini_bsz=32 +total_rollout_steps=$(((512*400))) +test_freq=10 +staleness_threshold=0 +trigger_parameter_sync_step=16 +partial_rollout=False + + +python -m recipe.fully_async_policy.fully_async_main \ + data.train_batch_size=${train_prompt_bsz} \ + data.gen_batch_size=${gen_prompt_bsz} \ + data.return_raw_chat=${return_raw_chat} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + actor_rollout_ref.actor.strategy=fsdp2 \ + critic.strategy=fsdp2 \ + actor_rollout_ref.hybrid_engine=False \ + actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.rollout.name=${rollout_name} \ + actor_rollout_ref.rollout.mode=${rollout_mode} \ + actor_rollout_ref.rollout.calculate_log_probs=True \ + trainer.nnodes="${NNODES_TRAIN}" \ + trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ + rollout.nnodes="${NNODES_ROLLOUT}" \ + rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ + rollout.total_rollout_steps="${total_rollout_steps}" \ + rollout.test_freq="${test_freq}" \ + async_training.staleness_threshold="${staleness_threshold}" \ + async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ + async_training.partial_rollout="${partial_rollout}" +``` + +## Experiments + +### Asynchronous Training on 7B Model + +We used Qwen2.5-Math-7B to verify the benefits of the fully async strategy under
long responses and multiple resource scales. +Using the `async stream pipeline with stale samples` strategy, we achieved about 2x performance improvement on 32 cards, +64 cards, and 128 cards without significantly affecting experimental results. + +* Machine: H20 +* Model: Qwen2.5-Math-7B +* Rollout length: max_response_length FSDP2: 28K tokens; +* Algorithm: DAPO +* Dataset: TRAIN_FILE: dapo-math-17k.parquet TEST_FILE: aime-2024.parquet +* Engine: vllm+FSDP2 +* rollout.n: 16 +* ppo_mini_batch_size: 32 +* test_freq: 20 + +* colocate sync: + * step: 400 + * train_batch_size: 512 + +* fully_async_policy + * total_rollout_steps: 512*400 + * require_batches: 4 + * trigger_parameter_sync_step: 4 + * staleness_threshold: 0.3 + * partial_rollout: True + +| training mode | resource allocation | step | gen | old_log_prob | update_actor | total time
100 step | total time
200 step | total time
300 step | total time
400 step | acc/mean@1 | +|:--------------------:|:---------------------:|:--------:|:--------:|:--------------:|:--------------:|:------------------------:|:------------------------:|:------------------------:|:------------------------:|:------------------------------:| +| colocate sync | 32 | 790.10 | 357.41 | 107.71 | 313.81 | 13h 44m | 1d 3h 43m | 2d 9h 22m | 3d 17h 5m | max: 0.3313
last: 0.2448 | +| fully_async_policy | 16:16 | | | \ | | | | | | max:
last: | +| colocate sync | 64 | 365.28 | 150.72 | 70.26 | 133.41 | 10h 22m | 20h 45m | 1d 7h 6m | 1d 17h 32m | max: 0.3365
last: 0.2333 | +| fully_async_policy | 32:32 | 189.26 | 28.46 | \ | 156.98 | 4h 57m
(2.09x) | 10h 14m
(2.03x) | 16h 58m
(1.83x) | 21h 40m
(1.92x) | max: 0.3677
last: 0.3406 | +| colocate sync | 128 | 356.30 | 177.85 | 53.92 | 113.81 | 8h 36m | 17h 56m | 1d 5h 6m | 1d 16h 48m | max: 0.3573
last: 0.2958 | +| fully_async_policy | 64:64 | 150.63 | 33.14 | \ | 113.16 | 3h 13m
(2.67x) | 6h 46m
(2.65x) | 10h 53m
(2.67x) | 17h 22m
(2.35x) | max: 0.3521
last: 0.3094 | + +> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg + +### 128-card 7B Asynchronous Mode Experiment + +We used Qwen2.5-Math-7B to verify the effects of various modes supported by fully async. +We can see that streaming alone yields a speedup of approximately 1.6x over colocate sync (1d 16h 48m vs. 1d 1h 53m at 400 steps), and after combining staleness and +partial_rollout, the speedup reaches 2.35x. + +| mode | step | gen | old_log_prob | update_actor | total time
100 step | total time
200 step | total time
300 step | total time
400 step | acc/mean@1 | +|:-------------------------------------------------------------------------------------------------------:|:---------------------:|:--------:|:--------------:|:--------------:|:------------------------:|:------------------------:|:------------------------:|:------------------------:|:-----------------------------:| +| colocate sync (128 cards) | 356.30 | 177.85 | 53.92 | 113.81 | 8h 36m | 17h 56m | 1d 5h 6m | 1d 16h 48m | max: 0.3573
last: 0.2958 | +| `stream off policy pipeline`
(+fully async: trigger_parameter_sync_step= 4,
require_batches= 4) | 231.34 | 128.47 | \ | 98.77 | 4h 25m | 9h 41m | 15h 2m | 1d 1h 53m | max: 0.2844
last: 0.2604 | +| `async stream pipeline with stale samples`
(+staleness_threshold=0.5) | | | | | | | | | | +| `async stream pipeline with partial rollout`
(+partial_rollout=True) | 150.63 | 33.14 | \ | 113.16 | 3h 13m | 6h 46m | 10h 53m | 17h 22m | max: 0.3521
last: 0.3094 | + +### 128-card Stale Ablation Experiment + +Under the `async stream pipeline with partial rollout` mode, we verified the impact of staleness settings on training +efficiency. +We found that the larger the staleness, the more obvious the final gains. +We also noticed that the times for staleness values of 0.3 and 0.5 are quite close, because as the training steps +increase, the response length changes significantly, causing training instability. +Further analysis and optimization are needed for this issue. + +| staleness_threshold | step | gen | old_log_prob | update_actor | total time
100 step | total time
200 step | total time
300 step | total time
400 step | acc/mean@1 | +|:---------------------:|:--------:|:--------:|:--------------:|:--------------:|:------------------------:|:------------------------:|:------------------------:|:------------------------:|:-----------------------------:| +| 0 | 231.34 | 128.47 | \ | 98.77 | 4h 25m | 9h 41m | 15h 2m | 1d 1h 53m | max: 0.2844
last: 0.2604 | +| 0.1 | 171.30 | 58.17 | \ | 109.12 | 3h 53m | 8h 37m | 14h 25m | 19h 59m | max: 0.3542
last: 0.2979 | +| 0.3 | 146.11 | 38.88 | \ | 103.22 | 3h 18m | 6h 49m | 11h 40m | 17h 20m | max: 0.3469
last: 0.2865 | +| 0.5 | 150.63 | 33.14 | \ | 113.16 | 3h 13m | 6h 46m | 10h 53m | 17h 22m | max: 0.3521
last: 0.3094 | + +> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg + +### 128-card 7B require_batches Ablation Experiment + +In multiple tests, we found that the number of samples issued each time in streaming affects the response length during +training, which in turn affects training time. We verified the impact on results by modifying +`async_training.require_batches`. + +| require_batches | step | gen | old_log_prob | update_actor | total time
100 step | total time
200 step | total time
300 step | acc/mean@1 | +|:-----------------:|:--------:|:-------:|:--------------:|:--------------:|:------------------------:|:------------------------:|:------------------------:|:-----------------------------:| +| 1 | 203.47 | 30.88 | \ | 181.08 | 3h 31m | 8h 29m | 17h 36m | max: 0.349
last: 0.326 | +| 2 | 158.72 | 26.32 | \ | 128.08 | 3h 35m | 7h 38m | 13h 57m | max: 0.351
last: 0.3406 | +| 4 | 124.64 | 25.62 | \ | 95.06 | 3h 13m | 6h 46m | 10h 53m | max: 0.3521
last: 0.3521 | + +> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg + +### 30B Model Mode Experiment + +TODO: The 30B experiment is still in progress. + +* Machine: H20 +* Model: Qwen2.5-32B +* Rollout length: max_response_length FSDP2: 20K tokens; +* Algorithm: DAPO +* Engine: vllm+FSDP2 +* rollout.n: 16 +* ppo_mini_batch_size: 32 +* test_freq: 20 + +* colocate sync: + * step:200 + * train_batch_size: 512 + +* fully_async_policy + * total_rollout_steps: 512*200 + * trigger_parameter_sync_step: 512/32 = 16 + * staleness_threshold: 0 + * partial_rollout: False + +| training mode | Resource allocation | mode | step | generate_sequences | old_log_prob | update_actor | total time | acc/best@32/mean | +|--------------------|---------------------|--------------------------------------------|------|--------------------|--------------|--------------|------------|------------------| +| colocate sync | 128 | | | | | | | | +| fully_async_policy | 64:64 | stream off policy pipeline | | | | | | | +| fully_async_policy | 64:64 | async stream pipeline with stale samples | | | | | | | +| fully_async_policy | 64:64 | async stream pipeline with partial rollout | | | | | | | + +> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg + +## Future Plans + +* GRPO experiments +* Megatron adaptation +* SGLang integration +* Transfer queue integration +* Asynchronous parameter synchronization +* AReaL asynchronous algorithm implementation +* TPPO algorithm implementation +* Multi-turn and Tool support \ No newline at end of file diff --git a/recipe/fully_async_policy/README.md b/recipe/fully_async_policy/README.md new file mode 100644 index 00000000000..77498131e45 --- /dev/null +++ b/recipe/fully_async_policy/README.md @@ -0,0 +1,427 @@ +# Recipe: Fully Async Policy Async Trainer + +**Author:** `https://github.com/meituan-search` + +Last updated: 10/16/2025. 
+ +This document introduces a fully asynchronous PPO training system that completely decouples the Trainer and Rollouter, +supporting asynchronous sample generation and training. +Under this system, we achieved a 2.35x-2.67x performance improvement when training the Qwen2.5-7B model with 128 GPUs, +without significantly affecting the results. + +## Introduction + +### Background + +The separated rollout and train architecture, compared to the colocate architecture, can allocate resources more +flexibly and design more flexible training logic, thereby addressing issues such as low GPU utilization and training +efficiency caused by long-tail problems. +The one_step_off_policy alleviates the problem of long rollout times and achieves some gains in training efficiency by +designing a separated architecture and performing asynchronous training between rollout and train for one round. +However, it forcibly uses data from one round of asynchronous training, which is not flexible enough and cannot +completely eliminate the impact of long-tail on training efficiency. +In other frameworks such as AReaL, Magistral, StreamRL, and AsyncFlow, asynchronous training and streaming training have +been implemented based on the separated architecture and have achieved gains. +We drew on their methods and implemented them in VERL. The fully_async_policy supports asynchronous, streaming, and partial +rollout training. +By reasonably setting parameters such as resource allocation and parameter synchronization frequency, fully_async_policy +can significantly improve training efficiency. 
+ +> Magistral https://arxiv.org/abs/2506.10910 +> +> AReaL: A Large-Scale Asynchronous Reinforcement Learning System for Language +> Reasoning https://arxiv.org/abs/2505.24298 +> +> StreamRL: Scalable, Heterogeneous, and Elastic RL for LLMs with Disaggregated Stream +> Generation https://arxiv.org/abs/2504.15930 +> +> AsyncFlow: An Asynchronous Streaming RL Framework for Efficient LLM Post-Training https://arxiv.org/abs/2507.01663 +> + +### Core Contributions + +* **Resource Isolation**: Unlike using hybrid_engine, Rollouter and Trainer use separate computing resources and need to + specify the resources they occupy separately. +* **Parallel Generation and Training**: While the Trainer is training, the Rollouter is generating new samples. +* **Multi-step Asynchronous**: Compared to one step off policy, it supports asynchronous settings from 0.x steps to + multiple steps, making the asynchronous solution more flexible. +* **NCCL Parameter Synchronization**: Uses NCCL communication primitives for parameter communication between Rollouter + and Trainer. +* **Stream Inference and Training**: Rollouter generates data sample by sample, and data transmission uses a single + sample as the minimum transmission unit. +* **Asynchronous Training and Freshness Control**: By setting the parameter async_training.staleness_threshold, it + supports training with samples generated by old parameters. +* **PartialRollout**: The Rollouter's inference process supports partial rollout logic. During parameter + synchronization, by adding `sleep()` and `resume()` logic, it + saves samples from ongoing rollouts and continues using them in the next rollout, reducing the time spent waiting for + ongoing tasks to finish during parameter synchronization. + +Currently, the supported usage mode is fsdp+vllm. vllm must use the server mode based on AgentLoop. + +## Design + +The overall architecture of fully_async_policy is shown in the figure below. 
fully_async_policy mainly consists of four +parts: Rollouter, MessageQueue, Trainer, and ParameterSynchronizer. + +![fully_async_policy_structure]( +https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/fully_async_policy_structure.svg?raw=true) + +1. Rollouter generates sequences sample by sample and puts the generated samples into the MessageQueue, with the + production speed controlled by freshness. +2. MessageQueue is used to temporarily store samples generated by Rollouter. +3. Trainer fetches samples from MessageQueue sample by sample. After fetching `require_batches*ppo_mini_batch_size` + samples, it will perform training. After training for async_training.trigger_parameter_sync_step rounds, it triggers + a parameter synchronization with Rollouter. +4. ParameterSynchronizer implements the NCCL synchronous parameter synchronization capability. + +The source of benefits compared to the base scheme lies in the fact that in the colocate case, using more resources for +rollout cannot solve the idleness caused by long-tail samples. +After we perform resource isolation, the time for rollout and train may be longer than before (because fewer resources +are used), +but the overlap in their time consumption reduces the end-to-end time consumption. 
+ +![fully_async_policy_revenue]( +https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/fully_async_policy_revenue.svg?raw=true) + +## Usage + +### Parameter Description + +| hyperparameter | description | +|-----------------------------------------------|------------------------------------------------------------------------------------------------| +| `trainer.nnodes` | Number of nodes for Trainer | +| `trainer.n_gpus_per_node` | Number of GPUs per node for Trainer | +| `rollout.nnodes` | Number of nodes for Rollouter | +| `rollout.n_gpus_per_node` | Number of GPUs per node for Rollouter | +| `data.train_batch_size` | In the fully async strategy, this value is not effective (default is 0) | +| `data.gen_batch_size` | In the fully async strategy, uses streaming sample production logic (default is 1) | +| `rollout.total_rollout_steps` | Total number of rollout samples | +| `rollout.test_freq` | How many times Rollouter updates parameters before performing a validation | +| `actor_rollout_ref.actor.ppo_mini_batch_size` | The ppo_mini_batch_size is a global num across all workers/gpus | +| `async_training.require_batches` | Number of ppo_mini_batch_size that FullyAsyncTrainer fetches at once | +| `async_training.trigger_parameter_sync_step` | Indicates how many local updates FullyAsyncTrainer performs before a parameter synchronization | +| `async_training.staleness_threshold` | Freshness control | +| `async_training.partial_rollout` | Whether to perform partial_rollout | +| `async_training.use_rollout_log_probs` | Use log_probs generated by rollout | + +**Further Explanation:** + +* `rollout.total_rollout_steps` + + Compared to colocate, the quantity can be aligned by multiplying train_batch_size and step: + `rollout.total_rollout_steps = data.train_batch_size * step`. 
+ +* `async_training.trigger_parameter_sync_step` + + In the fully async strategy, it indicates how many local updates the Trainer performs (i.e., how many times it fetches + `require_batches * ppo_mini_batch_size` samples) before a parameter synchronization with Rollouter. + Between every two parameter synchronizations between Rollouter and Trainer, the Trainer will process + `trigger_parameter_sync_step* require_batches*ppo_mini_batch_size` samples. + To fairly compare speed with colocate, trigger_parameter_sync_step should be set to + `data.train_batch_size / (require_batches * ppo_mini_batch_size)`. + +* `async_training.staleness_threshold` + + In the fully async strategy, it indicates the maximum proportion of stale samples allowed to be used. + + * staleness_threshold=0, indicates synchronous training. + Rollouter will generate a fixed number of samples between two parameter updates, the sample count is: + $$rollout\_num = (trigger\_parameter\_sync\_step*require\_batches*ppo\_mini\_batch\_size)$$ + * staleness_threshold>0, indicates asynchronous training, can be set to a decimal for more flexible asynchronous + calls. + Rollouter will generate at most the following number of samples between two parameter updates: + $$rollout\_num = (1+staleness\_threshold)*(trigger\_parameter\_sync\_step*require\_batches*ppo\_mini\_batch\_size) - num\_staleness\_sample $$ + + num_staleness_sample represents the number of stale samples generated in excess during the last rollout. + + Since it's a streaming system, rollout continues to generate and trainer continues to consume. If rollouter is slower, + trainer will trigger parameter synchronization earlier, and rollouter will not actually produce rollout_num samples. + When rollout is fast enough, setting staleness_threshold to 1 is basically equivalent to one_step_off policy. + To avoid too many expired samples affecting training accuracy, it is recommended to set this value to less than 1. 
+ +* `async_training.partial_rollout` + + partial_rollout only actually takes effect when staleness_threshold>0. + +* `async_training.use_rollout_log_probs` + + In reinforcement learning algorithms, log_probs have implicit correlations with parameter versions and tokens. Due to + the settings of algorithms like PPO/GRPO/DAPO, when calculating importance sampling, + old_log_prob must use the log_probs corresponding to the rollout parameters and tokens to ensure algorithm + correctness. In the fully + async strategy, we default to old_log_prob being calculated by rollout rather than by trainer. + + * `async_training.require_batches` + + In streaming training, require_batches should be set to 1, indicating that training is performed after producing + enough ppo_mini_batch_size samples. + In actual testing, we found that if fewer samples are issued at once, due to the order of data distribution, it can + cause training instability and longer response lengths. + Here, we additionally provide require_batches for streaming distribution and control the number of samples + participating in training at once. + +### Supported Modes + +1. on policy pipeline: + 1. **trigger_parameter_sync_step=1, staleness_threshold=0** + 2. Rollouter produces `require_batches*ppo_mini_batch_size` samples at once, Trainer fetches these samples for + training, and after training completes, Trainer and Rollouter perform a parameter synchronization; + 3. During the rollout phase, if there are long-tail samples but few rollout samples, shorter samples cannot fill + idle resources, causing some resource waste. + 4. As shown in figure a; + +2. stream off policy pipeline: + 1. **trigger_parameter_sync_step>1, staleness_threshold=0** + 2. Synchronous streaming training will be performed. 
Rollouter produces + `require_batches*ppo_mini_batch_size*trigger_parameter_sync_step` samples at once, Trainer performs a local + training every time it fetches `require_batches*ppo_mini_batch_size` samples, and after training + trigger_parameter_sync_step times, Trainer and Rollouter perform a parameter synchronization; + 3. Compared to a, since more samples are generated at once, resource idleness will be lower. + 4. In one step training, there will be two periods of resource idleness: when fetching the first batch of samples, + train waits for `require_batches*ppo_mini_batch_size` samples to be produced, and during the last parameter + update, rollout waits for training to complete. + 5. As shown in figure b; + +3. async stream pipeline with stale samples: + 1. **trigger_parameter_sync_step>=1, staleness_threshold>0, partial_rollout=False** + 2. After each parameter update, Rollouter will plan to produce at most rollout_num samples (in practice, the number + of samples generated may be less than this value depending on rollout speed). + 3. If the rollout process is relatively fast, Rollouter will generate some additional samples num_stale_samples + before parameter synchronization for immediate use by Trainer after synchronization. + When triggering parameter synchronization, if Rollouter has ongoing tasks, it will wait for the tasks to complete + and not add new tasks; + 4. Compared to b, except for the first step training, subsequent training will not have the time to wait for the + first batch rollout to finish, but will have the time to wait for active tasks to finish. + 5. As shown in figure c; + +4. async stream pipeline with partial rollout: + 1. **trigger_parameter_sync_step>=1, staleness_threshold>0, partial_rollout=True** + 2. Compared to c, when triggering parameter synchronization, if Rollouter has samples being produced, it will + interrupt the rollout process and perform parameter synchronization. 
The interrupted samples will continue to be + generated after synchronization. This reduces the time to wait for active tasks to finish. + 3. As shown in figure d; + +![fully_async_policy_mode]( +https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/fully_async_policy_mode.svg?raw=true) + +### Key Metrics + +| metrics | implication | +|------------------------------------------------|--------------------------------------------------------------------------------------------------------| +| `trainer/idle_ratio` | Trainer idle rate | +| `rollouter/idle_ratio` | Rollouter idle rate | +| `fully_async/count/stale_samples_processed` | Total number of old samples used in training | +| `fully_async/count/stale_trajectory_processed` | Total number of old trajectories used in training (one sample produces rollout.n trajectories) | +| `fully_async/partial/total_partial_num` | Number of partial samples processed by Trainer between two trigger_parameter_sync_step | +| `fully_async/partial/partial_ratio` | Ratio of partial samples processed by Trainer between two trigger_parameter_sync_step | +| `fully_async/partial/max_partial_span` | Maximum parameter span of partial samples processed by Trainer between two trigger_parameter_sync_step | + +### Parameter Tuning Recommendations + +* Resource Allocation and Adjustment: + * Reasonable resource allocation is the prerequisite for achieving good training efficiency. The ideal resource + allocation should make the rollout time and train time close, thereby minimizing pipeline bubbles in the entire + training process, + avoiding resource idleness, and ensuring Trainer does not use old samples. In real training scenarios, resource + allocation can be adjusted based on the idle time of rollout and train during actual training, + which can be obtained from rollouter/idle_ratio and trainer/idle_ratio. 
If rollouter/idle_ratio is high and + trainer/idle_ratio is low, + Trainer resources should be increased and Rollouter resources should be reduced, and vice versa. + +* Key Parameters: + * staleness_threshold: Setting it too high will cause more old samples to be used, affecting model performance. It + is recommended to set it to less than 1. + * require_batches: The closer to 1, the closer to a pure streaming process, the smaller the training bubbles, and + the faster the acceleration effect that can be achieved in terms of speed, but it will affect the order of sample + processing; + * trigger_parameter_sync_step: The smaller the setting, the closer to on policy, but it will cause frequent + parameter synchronization. Long-tail samples waste resources that cannot be filled by short samples, resulting in + low resource utilization. + The larger the setting, the higher the computational efficiency, but the accuracy will be affected by off policy. + * rollout.test_freq: It will occupy Rollouter resources and is not recommended to be set too small. + +* Mode Selection: By adjusting different parameters, the Fully Async architecture supports optimization acceleration at + different levels, suitable for tasks in different scenarios. + * For small-scale tasks that need to ensure training stability and on-policy nature, and have low speed + requirements, the on policy pipeline mode (Mode 1) can be tried. + * For scenarios that need to improve training throughput but are sensitive to staleness, the stream off policy + pipeline mode can be tried. That is, by + setting trigger_parameter_sync_step>1 to improve training efficiency, but still maintaining the synchronization + mechanism (staleness_threshold=0) (Mode 2). 
+ * For large-scale tasks with high training speed requirements and can tolerate a certain degree of off-policy and + staleness, setting staleness_threshold> + 0 and partial_rollout=True can improve training efficiency, using the async stream pipeline mode (Mode 3 or 4). + +### Quick Start + +```shell +rollout_mode="async" +rollout_name="vllm" # sglang or vllm +if [ "$rollout_mode" = "async" ]; then + export VLLM_USE_V1=1 + return_raw_chat="True" +fi + +train_prompt_bsz=0 +gen_prompt_bsz=1 +n_resp_per_prompt=16 +train_prompt_mini_bsz=32 +total_rollout_steps=$(((512*400))) +test_freq=10 +staleness_threshold=0 +trigger_parameter_sync_step=16 +partial_rollout=False + + +python -m recipe.fully_async_policy.fully_async_main \ + data.train_batch_size=${train_prompt_bsz} \ + data.gen_batch_size=${gen_prompt_bsz} \ + data.return_raw_chat=${return_raw_chat} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + actor_rollout_ref.actor.strategy=fsdp2 \ + critic.strategy=fsdp2 \ + actor_rollout_ref.hybrid_engine=False \ + actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.rollout.name=${rollout_name} \ + actor_rollout_ref.rollout.mode=${rollout_mode} \ + actor_rollout_ref.rollout.calculate_log_probs=True \ + trainer.nnodes="${NNODES_TRAIN}" \ + trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ + rollout.nnodes="${NNODES_ROLLOUT}" \ + rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ + rollout.total_rollout_steps="${total_rollout_steps}" \ + rollout.test_freq="${test_freq}" \ + async_training.staleness_threshold="${staleness_threshold}" \ + async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ + async_training.partial_rollout="${partial_rollout}" +``` + +## Experiments + +### Asynchronous Training on 7B Model + +We used Qwen2.5-Math-7B to verify the benefits of the fully async strategy under
long-response generation and different resource scales. +Using the `async stream pipeline with stale samples` strategy, we achieved about 2x performance improvement on 32 cards, +64 cards, and 128 cards without significantly affecting experimental results. + +* Machine: H20 +* Model: Qwen2.5-Math-7B +* Rollout length: max_response_length FSDP2: 28K tokens; +* Algorithm: DAPO +* Dataset: TRAIN_FILE: dapo-math-17k.parquet TEST_FILE: aime-2024.parquet +* Engine: vllm+FSDP2 +* rollout.n: 16 +* ppo_mini_batch_size: 32 +* test_freq: 20 + +* colocate sync: + * step: 400 + * train_batch_size: 512 + +* fully_async_policy + * total_rollout_steps: 512*400 + * require_batches: 4 + * trigger_parameter_sync_step: 4 + * staleness_threshold: 0.3 + * partial_rollout: True + +| training mode | resource allocation | step | gen | old_log_prob | update_actor | total time
100 step | total time
200 step | total time
300 step | total time
400 step | acc/mean@1 | +|:--------------------:|:---------------------:|:--------:|:--------:|:--------------:|:--------------:|:------------------------:|:------------------------:|:------------------------:|:------------------------:|:------------------------------:| +| colocate sync | 32 | 790.10 | 357.41 | 107.71 | 313.81 | 13h 44m | 1d 3h 43m | 2d 9h 22m | 3d 17h 5m | max: 0.3313
last: 0.2448 | +| fully_async_policy | 16:16 | | | \ | | | | | | max:
last: | +| colocate sync | 64 | 365.28 | 150.72 | 70.26 | 133.41 | 10h 22m | 20h 45m | 1d 7h 6m | 1d 17h 32m | max: 0.3365
last: 0.2333 | +| fully_async_policy | 32:32 | 189.26 | 28.46 | \ | 156.98 | 4h 57m
(2.09x) | 10h 14m
(2.03x) | 16h 58m
(1.83x) | 21h 40m
(1.92x) | max: 0.3677
last: 0.3406 | +| colocate sync | 128 | 356.30 | 177.85 | 53.92 | 113.81 | 8h 36m | 17h 56m | 1d 5h 6m | 1d 16h 48m | max: 0.3573
last: 0.2958 | +| fully_async_policy | 64:64 | 150.63 | 33.14 | \ | 113.16 | 3h 13m
(2.67x) | 6h 46m
(2.65x) | 10h 53m
(2.67x) | 17h 22m
(2.35x) | max: 0.3521
last: 0.3094 | + +> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg + +### 128-card 7B Asynchronous Mode Experiment + +We used Qwen2.5-Math-7B to verify the effects of various modes supported by fully async. +We can see that the speedup brought by streaming alone is approximately 1.6x, and after combining staleness and +partial_rollout, the speedup reaches 2.35x. + +| mode | step | gen | old_log_prob | update_actor | total time
100 step | total time
200 step | total time
300 step | total time
400 step | acc/mean@1 | +|:-------------------------------------------------------------------------------------------------------:|:---------------------:|:--------:|:--------------:|:--------------:|:------------------------:|:------------------------:|:------------------------:|:------------------------:|:-----------------------------:| +| colocate sync | 128 | 356.30 | 177.85 | 53.92 | 113.81 | 8h 36m | 17h 56m | 1d 5h 6m | 1d 16h 48m | max: 0.3573
last: 0.2958 | +| `stream off policy pipeline`
(+fully async: trigger_parameter_sync_step= 4,
require_batches= 4) | 231.34 | 128.47 | \ | 98.77 | 4h 25m | 9h 41m | 15h 2m | 1d 1h 53m | max: 0.2844
last: 0.2604 | +| `async stream pipeline with stale samples`
(+staleness_threshold=0.5) | | | | | | | | | | +| `async stream pipeline with partial rollout`
(+partial_rollout=True) | 150.63 | 33.14 | \ | 113.16 | 3h 13m | 6h 46m | 10h 53m | 17h 22m | max: 0.3521
last: 0.3094 | + +### 128-card Stale Ablation Experiment + +Under the `async stream pipeline with partial rollout` mode, we verified the impact of staleness settings on training +efficiency. +We found that the larger the staleness, the more obvious the final gains. +We also noticed that the times for staleness values of 0.3 and 0.5 are quite close, because as the training steps +increase, the response length changes significantly, causing training instability. +Further analysis and optimization are needed for this issue. + +| staleness_threshold | step | gen | old_log_prob | update_actor | total time
100 step | total time
200 step | total time
300 step | total time
400 step | acc/mean@1 | +|:---------------------:|:--------:|:--------:|:--------------:|:--------------:|:------------------------:|:------------------------:|:------------------------:|:------------------------:|:-----------------------------:| +| 0 | 231.34 | 128.47 | \ | 98.77 | 4h 25m | 9h 41m | 15h 2m | 1d 1h 53m | max: 0.2844
last: 0.2604 | +| 0.1 | 171.30 | 58.17 | \ | 109.12 | 3h 53m | 8h 37m | 14h 25m | 19h 59m | max: 0.3542
last: 0.2979 | +| 0.3 | 146.11 | 38.88 | \ | 103.22 | 3h 18m | 6h 49m | 11h 40m | 17h 20m | max: 0.3469
last: 0.2865 | +| 0.5 | 150.63 | 33.14 | \ | 113.16 | 3h 13m | 6h 46m | 10h 53m | 17h 22m | max: 0.3521
last: 0.3094 | + +> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg + +### 128-card 7B require_batches Ablation Experiment + +In multiple tests, we found that the number of samples issued each time in streaming affects the response length during +training, which in turn affects training time. We verified the impact on results by modifying +`async_training.require_batches`. + +| require_batches | step | gen | old_log_prob | update_actor | total time
100 step | total time
200 step | total time
300 step | acc/mean@1 | +|:-----------------:|:--------:|:-------:|:--------------:|:--------------:|:------------------------:|:------------------------:|:------------------------:|:-----------------------------:| +| 1 | 203.47 | 30.88 | \ | 181.08 | 3h 31m | 8h 29m | 17h 36m | max: 0.349
last: 0.326 | +| 2 | 158.72 | 26.32 | \ | 128.08 | 3h 35m | 7h 38m | 13h 57m | max: 0.351
last: 0.3406 | +| 4 | 124.64 | 25.62 | \ | 95.06 | 3h 13m | 6h 46m | 10h 53m | max: 0.3521
last: 0.3521 | + +> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg + +### 30B Model Mode Experiment + +TODO: The 30B experiment is still in progress. + +* Machine: H20 +* Model: Qwen2.5-32B +* Rollout length: max_response_length FSDP2: 20K tokens; +* Algorithm: DAPO +* Engine: vllm+FSDP2 +* rollout.n: 16 +* ppo_mini_batch_size: 32 +* test_freq: 20 + +* colocate sync: + * step:200 + * train_batch_size: 512 + +* fully_async_policy + * total_rollout_steps: 512*200 + * trigger_parameter_sync_step: 512/32 = 16 + * staleness_threshold: 0 + * partial_rollout: False + +| training mode | Resource allocation | mode | step | generate_sequences | old_log_prob | update_actor | total time | acc/best@32/mean | +|--------------------|---------------------|--------------------------------------------|------|--------------------|--------------|--------------|------------|------------------| +| colocate sync | 128 | | | | | | | | +| fully_async_policy | 64:64 | stream off policy pipeline | | | | | | | +| fully_async_policy | 64:64 | async stream pipeline with stale samples | | | | | | | +| fully_async_policy | 64:64 | async stream pipeline with partial rollout | | | | | | | + +> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg + +## Future Plans + +* GRPO experiments +* Megatron adaptation +* SGLang integration +* Transfer queue integration +* Asynchronous parameter synchronization +* AReaL asynchronous algorithm implementation +* TPPO algorithm implementation +* Multi-turn and Tool support \ No newline at end of file diff --git a/recipe/fully_async_policy/README_zh.md b/recipe/fully_async_policy/README_zh.md index 87f75db93a4..e6751213841 100644 --- a/recipe/fully_async_policy/README_zh.md +++ b/recipe/fully_async_policy/README_zh.md @@ -5,6 +5,7 @@ Last updated: 10/16/2025. 
本文档介绍了完全异步PPO训练系统,该系统实现了 Trainer 和 Rollouter 的完全解耦,支持异步样本生成和训练。 +在该系统下,我们使用128卡训练qwen2.5-7B模型取得了2.35x-2.67x的性能提升,同时效果没有显著受到影响。 ## Introduction @@ -124,12 +125,11 @@ https://github.com/ArronHZG/verl-community/blob/recipe/async_policy/docs/fully_a 即 old_log_prob必须使用rollout参数及token所对应log_probs,才能保证算法的正确性。在fully async策略中,我们默认old_log_prob是有rollout所计算的,而不是由trainer所计算。 - * `async_training.require_batches` - + * `async_training.require_batches` + 在流式训练中,require_batches 应该设置为1,表示生产够ppo_mini_batch_size样本后,就进行训练。 在实际测试中,我们发现,如果单次下发的样本较少,由于数据分发的顺序,会导致训练不稳定,response 长度变长。 在这里,我们额外提供 require_batches 进行流式分发,单次参与训练的样本数量控制。 - ### 模式支持 @@ -252,7 +252,8 @@ python -m recipe.fully_async_policy.fully_async_main \ ### 在7B模型上进行异步训练 -我们使用 Qwen2.5-Math-7B 验证 fully async 策略在长候选下,各个资源的收益。 +我们使用 Qwen2.5-Math-7B 验证 fully async 策略在长候选下,多种资源下的收益情况。 +使用`async stream pipeline with staleness samples` 策略,我们在32卡,64卡,128卡都取得2x左右的性能提升,同时没有显著影响实验效果。 * 机器:H20 * 模型:Qwen2.5-Math-7B @@ -275,49 +276,54 @@ python -m recipe.fully_async_policy.fully_async_main \ * staleness_threshold: 0.3 * partial_rollout: True -| training mode | resource allocation | step | gen | old_log_prob | update_actor | total time
100 step | total time
200 step | total time
300 step | total time
400 step | acc/mean@1 | -|:------------------: |:-------------------: |:------: |:------: |:------------: |:------------: |:----------------------: |:----------------------: |:----------------------: |:----------------------: |:----------------------------: | -| colocate sync | 32 | 790.10 | 357.41 | 107.71 | 313.81 | 13h 44m | 1d 3h 43m | 2d 9h 22m | 3d 17h 5m | max: 0.3313
last: 0.2448 | -| fully_async_policy | 16:16 | | | \ | | | | | | max:
last: | -| colocate sync | 64 | 365.28 | 150.72 | 70.26 | 133.41 | 10h 22m | 20h 45m | 1d 7h 6m | 1d 17h 32m | max: 0.3365
last: 0.2333 | -| fully_async_policy | 32:32 | 189.26 | 28.46 | \ | 156.98 | 4h 57m
(2.09x) | 10h 14m
(2.03x) | 16h 58m
(1.83x) | 21h 40m
(1.92x) | max: 0.3677
last: 0.3406 | -| colocate sync | 128 | 356.30 | 177.85 | 53.92 | 113.81 | 8h 36m | 17h 56m | 1d 5h 6m | 1d 16h 48m | max: 0.3573
last: 0.2958 | -| fully_async_policy | 64:64 | 150.63 | 33.14 | \ | 113.16 | 3h 13m
(2.67x) | 6h 46m
(2.65x) | 10h 53m
(2.67x) | 17h 22m
(2.35x) | max: 0.3521
last: 0.3094 | +| training mode | resource allocation | step | gen | old_log_prob | update_actor | total time
100 step | total time
200 step | total time
300 step | total time
400 step | acc/mean@1 | +|:--------------------:|:---------------------:|:--------:|:--------:|:--------------:|:--------------:|:------------------------:|:------------------------:|:------------------------:|:------------------------:|:------------------------------:| +| colocate sync | 32 | 790.10 | 357.41 | 107.71 | 313.81 | 13h 44m | 1d 3h 43m | 2d 9h 22m | 3d 17h 5m | max: 0.3313
last: 0.2448 | +| fully_async_policy | 16:16 | | | \ | | | | | | max:
last: | +| colocate sync | 64 | 365.28 | 150.72 | 70.26 | 133.41 | 10h 22m | 20h 45m | 1d 7h 6m | 1d 17h 32m | max: 0.3365
last: 0.2333 | +| fully_async_policy | 32:32 | 189.26 | 28.46 | \ | 156.98 | 4h 57m
(2.09x) | 10h 14m
(2.03x) | 16h 58m
(1.83x) | 21h 40m
(1.92x) | max: 0.3677
last: 0.3406 | +| colocate sync | 128 | 356.30 | 177.85 | 53.92 | 113.81 | 8h 36m | 17h 56m | 1d 5h 6m | 1d 16h 48m | max: 0.3573
last: 0.2958 | +| fully_async_policy | 64:64 | 150.63 | 33.14 | \ | 113.16 | 3h 13m
(2.67x) | 6h 46m
(2.65x) | 10h 53m
(2.67x) | 17h 22m
(2.35x) | max: 0.3521
last: 0.3094 | > source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg ### 128卡 7B 异步模式实验 -我们使用 Qwen2.5-Math-7B 验证 fully async 所支持的各个模型的效果。 +我们使用 Qwen2.5-Math-7B 验证 fully async 所支持的各个模式的效果。 -| mode | step | gen | old_log_prob | update_actor | total time
100 step | total time
200 step | total time
300 step | total time
400 step | acc/mean@1 | -|:-------------------------------------------------------------------------------------: |:------: |:------: |:------------: |:------------: |:----------------------: |:----------------------: |:----------------------: |:----------------------: |:---------------------------: | -| `stream off policy pipeline`
(trigger_parameter_sync_step= 4,
require_batches= 4) | 231.34 | 128.47 | \ | 98.77 | 4h 25m | 9h 41m | 15h 2m | 1d 1h 53m | max: 0.2844
last: 0.2604 | -| `async stream pipeline with staleness samples`
(+staleness_threshold=0.5) | | | | | | | | | | -| `async stream pipeline with partial rollout`
(+partial_rollout=True) | 150.63 | 33.14 | \ | 113.16 | 3h 13m | 6h 46m | 10h 53m | 17h 22m | max: 0.3521
last: 0.3094 | +| mode | step | gen | old_log_prob | update_actor | total time
100 step | total time
200 step | total time
300 step | total time
400 step | acc/mean@1 | +|:-------------------------------------------------------------------------------------------------------:|:---------------------:|:--------:|:--------------:|:--------------:|:------------------------:|:------------------------:|:------------------------:|:------------------------:|:-----------------------------:| +| colocate sync | 128 | 356.30 | 177.85 | 53.92 | 113.81 | 8h 36m | 17h 56m | 1d 5h 6m | 1d 16h 48m | max: 0.3573
last: 0.2958 | +| `stream off policy pipeline`
(+fully async: trigger_parameter_sync_step= 4,
require_batches= 4) | 231.34 | 128.47 | \ | 98.77 | 4h 25m | 9h 41m | 15h 2m | 1d 1h 53m | max: 0.2844
last: 0.2604 | +| `async stream pipeline with staleness samples`
(+staleness_threshold=0.5) | | | | | | | | | | +| `async stream pipeline with partial rollout`
(+partial_rollout=True) | 150.63 | 33.14 | \ | 113.16 | 3h 13m | 6h 46m | 10h 53m | 17h 22m | max: 0.3521
last: 0.3094 | ### 128卡 stale 消融实验 在 `async stream pipeline with partial rollout` 模式下,我们验证 staleness 的设置对于训练效率的影响。 +我们可以发现,staleness 越大,最终取得的收益越明显。 +同时我们也注意到 staleness 取 0.3 和 0.5 的时间比较接近,原因是随着训练步数的增量,response 长度变化较大,训练出现了不稳定的问题。 +后续还需要针对该问题进行进一步的分析和优化。 -| staleness_threshold | step | gen | old_log_prob | update_actor | total time
100 step | total time
200 step | total time
300 step | total time
400 step | acc/mean@1 | -|:-------------------: |:------: |:------: |:------------: |:------------: |:----------------------: |:----------------------: |:----------------------: |:----------------------: |:---------------------------: | -| 0 | 231.34 | 128.47 | \ | 98.77 | 4h 25m | 9h 41m | 15h 2m | 1d 1h 53m | max: 0.2844
last: 0.2604 | -| 0.1 | 171.30 | 58.17 | \ | 109.12 | 3h 53m | 8h 37m | 14h 25m | 19h 59m | max: 0.3542
last: 0.2979 | -| 0.3 | 146.11 | 38.88 | \ | 103.22 | 3h 18m | 6h 49m | 11h 40m | 17h 20m | max: 0.3469
last: 0.2865 | -| 0.5 | 150.63 | 33.14 | \ | 113.16 | 3h 13m | 6h 46m | 10h 53m | 17h 22m | max: 0.3521
last: 0.3094 | +| staleness_threshold | step | gen | old_log_prob | update_actor | total time
100 step | total time
200 step | total time
300 step | total time
400 step | acc/mean@1 | +|:---------------------:|:--------:|:--------:|:--------------:|:--------------:|:------------------------:|:------------------------:|:------------------------:|:------------------------:|:-----------------------------:| +| 0 | 231.34 | 128.47 | \ | 98.77 | 4h 25m | 9h 41m | 15h 2m | 1d 1h 53m | max: 0.2844
last: 0.2604 | +| 0.1 | 171.30 | 58.17 | \ | 109.12 | 3h 53m | 8h 37m | 14h 25m | 19h 59m | max: 0.3542
last: 0.2979 | +| 0.3 | 146.11 | 38.88 | \ | 103.22 | 3h 18m | 6h 49m | 11h 40m | 17h 20m | max: 0.3469
last: 0.2865 | +| 0.5 | 150.63 | 33.14 | \ | 113.16 | 3h 13m | 6h 46m | 10h 53m | 17h 22m | max: 0.3521
last: 0.3094 | > source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg ### 128卡 7B require_batches 消融实验 -在多次测试下,我们发现流式每次下发样本的数量,会影响训练的结果,我们通过修改 `async_training.require_batches` 验证对与结果的影响。 +在多次测试下,我们发现流式每次下发样本的数量会影响训练的response长度,进而影响训练时长,我们通过修改 +`async_training.require_batches` 验证对于结果的影响。 -| require_batches | step | gen | old_log_prob | update_actor | total time
100 step | total time
200 step | total time
300 step | acc/mean@1 | -|:---------------: |:------: |:-----: |:------------: |:------------: |:----------------------: |:----------------------: |:----------------------: |:---------------------------: | -| 1 | 203.47 | 30.88 | \ | 181.08 | 3h 31m | 8h 29m | 17h 36m | max: 0.349
last: 0.326 | -| 2 | 158.72 | 26.32 | \ | 128.08 | 3h 35m | 7h 38m | 13h 57m | max: 0.351
last: 0.3406 | -| 4 | 124.64 | 25.62 | \ | 95.06 | 3h 13m | 6h 46m | 10h 53m | max: 0.3521
last: 0.3521 | +| require_batches | step | gen | old_log_prob | update_actor | total time
100 step | total time
200 step | total time
300 step | acc/mean@1 | +|:-----------------:|:--------:|:-------:|:--------------:|:--------------:|:------------------------:|:------------------------:|:------------------------:|:-----------------------------:| +| 1 | 203.47 | 30.88 | \ | 181.08 | 3h 31m | 8h 29m | 17h 36m | max: 0.349
last: 0.326 | +| 2 | 158.72 | 26.32 | \ | 128.08 | 3h 35m | 7h 38m | 13h 57m | max: 0.351
last: 0.3406 | +| 4 | 124.64 | 25.62 | \ | 95.06 | 3h 13m | 6h 46m | 10h 53m | max: 0.3521
last: 0.3521 | > source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg diff --git a/verl/workers/actor/dp_actor.py b/verl/workers/actor/dp_actor.py index 5955dfc33ed..7dd531ad266 100644 --- a/verl/workers/actor/dp_actor.py +++ b/verl/workers/actor/dp_actor.py @@ -429,7 +429,6 @@ def update_policy(self, data: DataProto): # for fully_async_policy recipe if hasattr(self.config, "use_rollout_log_probs") and self.config.use_rollout_log_probs: - print("for fully_async_policy recipe") old_log_prob = model_inputs["old_log_probs"] else: if on_policy: From de055102c9451852c0384756321bcfe8f9f71958 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Thu, 16 Oct 2025 19:26:16 +0800 Subject: [PATCH 176/182] update readme --- docs/advance/fully_async.md | 11 ++++++----- recipe/fully_async_policy/README.md | 11 ++++++----- recipe/fully_async_policy/README_zh.md | 16 ++++++++++------ 3 files changed, 22 insertions(+), 16 deletions(-) diff --git a/docs/advance/fully_async.md b/docs/advance/fully_async.md index 77498131e45..6dac051922c 100644 --- a/docs/advance/fully_async.md +++ b/docs/advance/fully_async.md @@ -336,7 +336,7 @@ Using the `async stream pipeline with stale samples` strategy, we achieved about | colocate sync | 128 | 356.30 | 177.85 | 53.92 | 113.81 | 8h 36m | 17h 56m | 1d 5h 6m | 1d 16h 48m | max: 0.3573
last: 0.2958 | | fully_async_policy | 64:64 | 150.63 | 33.14 | \ | 113.16 | 3h 13m
(2.67x) | 6h 46m
(2.65x) | 10h 53m
(2.67x) | 17h 22m
(2.35x) | max: 0.3521
last: 0.3094 | -> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg +> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-colocate_async?nw=nwuserhouzg ### 128-card 7B Asynchronous Mode Experiment @@ -351,6 +351,8 @@ partial_rollout, the benefit reaches 2.35x. | `async stream pipeline with stale samples`
(+staleness_threshold=0.5) | | | | | | | | | | | `async stream pipeline with partial rollout`
(+partial_rollout=True) | 150.63 | 33.14 | \ | 113.16 | 3h 13m | 6h 46m | 10h 53m | 17h 22m | max: 0.3521
last: 0.3094 | +> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-stream_stale_partial?nw=nwuserhouzg + ### 128-card Stale Ablation Experiment Under the `async stream pipeline with partial rollout` mode, we verified the impact of staleness settings on training @@ -367,7 +369,7 @@ Further analysis and optimization are needed for this issue. | 0.3 | 146.11 | 38.88 | \ | 103.22 | 3h 18m | 6h 49m | 11h 40m | 17h 20m | max: 0.3469
last: 0.2865 | | 0.5 | 150.63 | 33.14 | \ | 113.16 | 3h 13m | 6h 46m | 10h 53m | 17h 22m | max: 0.3521
last: 0.3094 | -> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg +> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-stream_stale_partial?nw=nwuserhouzg ### 128-card 7B require_batches Ablation Experiment @@ -381,14 +383,14 @@ training, which in turn affects training time. We verified the impact on results | 2 | 158.72 | 26.32 | \ | 128.08 | 3h 35m | 7h 38m | 13h 57m | max: 0.351
last: 0.3406 | | 4 | 124.64 | 25.62 | \ | 95.06 | 3h 13m | 6h 46m | 10h 53m | max: 0.3521
last: 0.3521 | -> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg +> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-ablation_require_batches?nw=nwuserhouzg ### 30B Model Mode Experiment TODO: The 30B experiment is still in progress. * Machine: H20 -* Model: Qwen2.5-32B +* Model: Qwen2.5-32B * Rollout length: max_response_length FSDP2: 20K tokens; * Algorithm: DAPO * Engine: vllm+FSDP2 @@ -413,7 +415,6 @@ TODO: The 30B experiment is still in progress. | fully_async_policy | 64:64 | async stream pipeline with stale samples | | | | | | | | fully_async_policy | 64:64 | async stream pipeline with partial rollout | | | | | | | -> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg ## Future Plans diff --git a/recipe/fully_async_policy/README.md b/recipe/fully_async_policy/README.md index 77498131e45..6dac051922c 100644 --- a/recipe/fully_async_policy/README.md +++ b/recipe/fully_async_policy/README.md @@ -336,7 +336,7 @@ Using the `async stream pipeline with stale samples` strategy, we achieved about | colocate sync | 128 | 356.30 | 177.85 | 53.92 | 113.81 | 8h 36m | 17h 56m | 1d 5h 6m | 1d 16h 48m | max: 0.3573
last: 0.2958 | | fully_async_policy | 64:64 | 150.63 | 33.14 | \ | 113.16 | 3h 13m
(2.67x) | 6h 46m
(2.65x) | 10h 53m
(2.67x) | 17h 22m
(2.35x) | max: 0.3521
last: 0.3094 | -> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg +> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-colocate_async?nw=nwuserhouzg ### 128-card 7B Asynchronous Mode Experiment @@ -351,6 +351,8 @@ partial_rollout, the benefit reaches 2.35x. | `async stream pipeline with stale samples`
(+staleness_threshold=0.5) | | | | | | | | | | | `async stream pipeline with partial rollout`
(+partial_rollout=True) | 150.63 | 33.14 | \ | 113.16 | 3h 13m | 6h 46m | 10h 53m | 17h 22m | max: 0.3521
last: 0.3094 | +> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-stream_stale_partial?nw=nwuserhouzg + ### 128-card Stale Ablation Experiment Under the `async stream pipeline with partial rollout` mode, we verified the impact of staleness settings on training @@ -367,7 +369,7 @@ Further analysis and optimization are needed for this issue. | 0.3 | 146.11 | 38.88 | \ | 103.22 | 3h 18m | 6h 49m | 11h 40m | 17h 20m | max: 0.3469
last: 0.2865 | | 0.5 | 150.63 | 33.14 | \ | 113.16 | 3h 13m | 6h 46m | 10h 53m | 17h 22m | max: 0.3521
last: 0.3094 | -> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg +> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-ablation_stale?nw=nwuserhouzg ### 128-card 7B require_batches Ablation Experiment @@ -381,14 +383,14 @@ training, which in turn affects training time. We verified the impact on results | 2 | 158.72 | 26.32 | \ | 128.08 | 3h 35m | 7h 38m | 13h 57m | max: 0.351
last: 0.3406 | | 4 | 124.64 | 25.62 | \ | 95.06 | 3h 13m | 6h 46m | 10h 53m | max: 0.3521
last: 0.3521 | -> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg +> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-ablation_require_batches?nw=nwuserhouzg ### 30B Model Mode Experiment TODO: The 30B experiment is still in progress. * Machine: H20 -* Model: Qwen2.5-32B +* Model: Qwen2.5-32B * Rollout length: max_response_length FSDP2: 20K tokens; * Algorithm: DAPO * Engine: vllm+FSDP2 @@ -413,7 +415,6 @@ TODO: The 30B experiment is still in progress. | fully_async_policy | 64:64 | async stream pipeline with stale samples | | | | | | | | fully_async_policy | 64:64 | async stream pipeline with partial rollout | | | | | | | -> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg ## Future Plans diff --git a/recipe/fully_async_policy/README_zh.md b/recipe/fully_async_policy/README_zh.md index e6751213841..ea0e8c14679 100644 --- a/recipe/fully_async_policy/README_zh.md +++ b/recipe/fully_async_policy/README_zh.md @@ -279,17 +279,18 @@ python -m recipe.fully_async_policy.fully_async_main \ | training mode | resource allocation | step | gen | old_log_prob | update_actor | total time
100 step | total time
200 step | total time
300 step | total time
400 step | acc/mean@1 | |:--------------------:|:---------------------:|:--------:|:--------:|:--------------:|:--------------:|:------------------------:|:------------------------:|:------------------------:|:------------------------:|:------------------------------:| | colocate sync | 32 | 790.10 | 357.41 | 107.71 | 313.81 | 13h 44m | 1d 3h 43m | 2d 9h 22m | 3d 17h 5m | max: 0.3313
last: 0.2448 | -| fully_async_policy | 16:16 | | | \ | | | | | | max:
last: | +| fully_async_policy | 16:16 | | | \ | | | | | | max:
last: | | colocate sync | 64 | 365.28 | 150.72 | 70.26 | 133.41 | 10h 22m | 20h 45m | 1d 7h 6m | 1d 17h 32m | max: 0.3365
last: 0.2333 | | fully_async_policy | 32:32 | 189.26 | 28.46 | \ | 156.98 | 4h 57m
(2.09x) | 10h 14m
(2.03x) | 16h 58m
(1.83x) | 21h 40m
(1.92x) | max: 0.3677
last: 0.3406 | | colocate sync | 128 | 356.30 | 177.85 | 53.92 | 113.81 | 8h 36m | 17h 56m | 1d 5h 6m | 1d 16h 48m | max: 0.3573
last: 0.2958 | | fully_async_policy | 64:64 | 150.63 | 33.14 | \ | 113.16 | 3h 13m
(2.67x) | 6h 46m
(2.65x) | 10h 53m
(2.67x) | 17h 22m
(2.35x) | max: 0.3521
last: 0.3094 | -> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg +> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-colocate_async?nw=nwuserhouzg ### 128卡 7B 异步模式实验 我们使用 Qwen2.5-Math-7B 验证 fully async 所支持的各个模式的效果。 +我们可以看到 stream 带来的收益大约0.6x,叠加 staleness 和 partial_rollout 后,收益为2.35x。 | mode | step | gen | old_log_prob | update_actor | total time
100 step | total time
200 step | total time
300 step | total time
400 step | acc/mean@1 | |:-------------------------------------------------------------------------------------------------------:|:---------------------:|:--------:|:--------------:|:--------------:|:------------------------:|:------------------------:|:------------------------:|:------------------------:|:-----------------------------:| @@ -298,6 +299,8 @@ python -m recipe.fully_async_policy.fully_async_main \ | `async stream pipeline with staleness samples`
(+staleness_threshold=0.5) | | | | | | | | | | | `async stream pipeline with partial rollout`
(+partial_rollout=True) | 150.63 | 33.14 | \ | 113.16 | 3h 13m | 6h 46m | 10h 53m | 17h 22m | max: 0.3521
last: 0.3094 | +> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-stream_stale_partial?nw=nwuserhouzg + ### 128卡 stale 消融实验 在 `async stream pipeline with partial rollout` 模式下,我们验证 staleness 的设置对于训练效率的影响。 @@ -312,9 +315,9 @@ python -m recipe.fully_async_policy.fully_async_main \ | 0.3 | 146.11 | 38.88 | \ | 103.22 | 3h 18m | 6h 49m | 11h 40m | 17h 20m | max: 0.3469
last: 0.2865 | | 0.5 | 150.63 | 33.14 | \ | 113.16 | 3h 13m | 6h 46m | 10h 53m | 17h 22m | max: 0.3521
last: 0.3094 | -> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg +> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-ablation_stale?nw=nwuserhouzg -### 128卡 7B require_batches 消融实验 +### 128卡 7B require_batches 消融实验 在多次测试下,我们发现流式每次下发样本的数量会影响训练的response长度,进而影响训练时长,我们通过修改 `async_training.require_batches` 验证对与结果的影响。 @@ -325,10 +328,12 @@ python -m recipe.fully_async_policy.fully_async_main \ | 2 | 158.72 | 26.32 | \ | 128.08 | 3h 35m | 7h 38m | 13h 57m | max: 0.351
last: 0.3406 | | 4 | 124.64 | 25.62 | \ | 95.06 | 3h 13m | 6h 46m | 10h 53m | max: 0.3521
last: 0.3521 | -> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg +> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-ablation_require_batches?nw=nwuserhouzg ### 30B模型模式实验 +TODO: 30B 的实验,还在完善中。 + * 机器: H20 * 模型:Qwen2.5-32B * rollout长度:max_response_length FSDP2: 20K tokens; @@ -355,7 +360,6 @@ python -m recipe.fully_async_policy.fully_async_main \ | fully_async_policy | 64:64 | async stream pipeline with staleness samples | | | | | | | | fully_async_policy | 64:64 | async stream pipeline with partial rollout | | | | | | | -> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy?nw=nwuserhouzg ## 后续计划 From 4e122bfd709fd0bcddedbe0a8d576649428a1b68 Mon Sep 17 00:00:00 2001 From: wangshulin02 <953550366@qq.com> Date: Fri, 17 Oct 2025 10:46:15 +0800 Subject: [PATCH 177/182] update shell script --- ...2_64_64.sh => dapo_7b_math_fsdp2_16-16.sh} | 30 ++-- .../shell/dapo_7b_math_fsdp2_32_32.sh | 162 ++++++++++++++++++ .../shell/dapo_7b_math_fsdp2_4_12.sh | 12 +- .../shell/dapo_7b_math_fsdp2_4_4.sh | 16 +- .../shell/dapo_7b_math_fsdp2_64_64.sh | 20 +-- .../shell/dapo_7b_math_fsdp2_8_8.sh | 16 +- 6 files changed, 200 insertions(+), 56 deletions(-) rename recipe/fully_async_policy/shell/{dapo-32B_fsdp2_64_64.sh => dapo_7b_math_fsdp2_16-16.sh} (89%) create mode 100644 recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_32_32.sh diff --git a/recipe/fully_async_policy/shell/dapo-32B_fsdp2_64_64.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_16-16.sh similarity index 89% rename from recipe/fully_async_policy/shell/dapo-32B_fsdp2_64_64.sh rename to recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_16-16.sh index 324a7d9470e..82072c3a0eb 100644 --- a/recipe/fully_async_policy/shell/dapo-32B_fsdp2_64_64.sh +++ b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_16-16.sh @@ -2,7 +2,7 @@ set -xeuo pipefail project_name='DAPO' -exp_name='dapo_qwen2-32B_20k_fsdp2_fully-async_64-64' 
+exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_16-16' # Ray # RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} @@ -16,11 +16,6 @@ CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} -MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B -CKPTS_DIR=./ckpts/${project_name}/${exp_name} -TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet -TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet - rollout_mode="async" rollout_name="vllm" # sglang or vllm if [ "$rollout_mode" = "async" ]; then @@ -41,7 +36,7 @@ clip_ratio_high=0.28 # Response length parameters max_prompt_length=$((1024 * 2)) -max_response_length=$((1024 * 20)) +max_response_length=$((1024 * 28)) enable_overlong_buffer=True overlong_buffer_len=$((1024 * 4)) overlong_penalty_factor=1.0 @@ -62,23 +57,24 @@ infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) ref_offload=True actor_offload=False gen_tp=4 -sp_size=8 -fsdp_size=-1 +sp_size=4 +fsdp_size=8 # Fully async specific parameters -NNODES_ROLLOUT=${NNODES_ROLLOUT:-8} -NNODES_TRAIN=${NNODES_TRAIN:-8} +NNODES_ROLLOUT=${NNODES_ROLLOUT:-2} +NNODES_TRAIN=${NNODES_TRAIN:-2} NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} train_prompt_bsz=0 gen_prompt_bsz=1 n_resp_per_prompt=16 train_prompt_mini_bsz=32 -total_rollout_steps=$(((512*200))) +total_rollout_steps=$(((512*400))) test_freq=20 -staleness_threshold=0 -trigger_parameter_sync_step=16 -partial_rollout=False +staleness_threshold=0.1 +trigger_parameter_sync_step=4 +require_batches=4 +partial_rollout=True python -m recipe.fully_async_policy.fully_async_main \ data.train_files="${TRAIN_FILE}" \ @@ -161,4 +157,6 @@ python -m 
recipe.fully_async_policy.fully_async_main \ rollout.test_freq="${test_freq}" \ async_training.staleness_threshold="${staleness_threshold}" \ async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ - async_training.partial_rollout="${partial_rollout}" + async_training.require_batches="${require_batches}" \ + async_training.partial_rollout="${partial_rollout}" \ + async_training.use_rollout_log_probs=True diff --git a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_32_32.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_32_32.sh new file mode 100644 index 00000000000..ded0b0d42cd --- /dev/null +++ b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_32_32.sh @@ -0,0 +1,162 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +project_name='DAPO' +exp_name='dapo_qwen2-7B-math_28k_fsdp2_fully-async_32-32' + +# Ray +# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} +# WORKING_DIR=${WORKING_DIR:-"${PWD}"} +# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} +# Paths +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +# very important! 
please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface +MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"} +CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} +TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} +TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} + +rollout_mode="async" +rollout_name="vllm" # sglang or vllm +if [ "$rollout_mode" = "async" ]; then + export VLLM_USE_V1=1 + return_raw_chat="True" +fi + +# Algorithm parameters +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +# Response length parameters +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 * 28)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 + +# Training parameters +loss_agg_mode="token-mean" + +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + +# Performance Related Parameter +use_dynamic_bsz=True +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) +ref_offload=True +actor_offload=False +gen_tp=4 +sp_size=4 +fsdp_size=8 + +# Fully async specific parameters +NNODES_ROLLOUT=${NNODES_ROLLOUT:-4} +NNODES_TRAIN=${NNODES_TRAIN:-4} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} + +train_prompt_bsz=0 +gen_prompt_bsz=1 +n_resp_per_prompt=16 +train_prompt_mini_bsz=32 +total_rollout_steps=$(((512*400))) +test_freq=20 +staleness_threshold=0.1 +trigger_parameter_sync_step=4 +require_batches=4 +partial_rollout=True + +python -m recipe.fully_async_policy.fully_async_main \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + 
data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + data.gen_batch_size=${gen_prompt_bsz} \ + data.return_raw_chat=${return_raw_chat} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.strategy=fsdp2 \ + critic.strategy=fsdp2 \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.hybrid_engine=False \ + +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ + actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.grad_clip=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ + 
actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \ + actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ + actor_rollout_ref.rollout.name=${rollout_name} \ + actor_rollout_ref.rollout.mode=${rollout_mode} \ + actor_rollout_ref.rollout.calculate_log_probs=True \ + reward_model.reward_manager=dapo \ + +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger=['console','tensorboard'] \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.val_before_train=True \ + trainer.save_freq=-1 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.resume_mode=auto \ + trainer.nnodes="${NNODES_TRAIN}" \ + trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ + rollout.nnodes="${NNODES_ROLLOUT}" \ + rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \ + rollout.total_rollout_steps="${total_rollout_steps}" \ + 
rollout.total_epochs=10 \ + rollout.test_freq="${test_freq}" \ + async_training.staleness_threshold="${staleness_threshold}" \ + async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ + async_training.require_batches="${require_batches}" \ + async_training.partial_rollout="${partial_rollout}" \ + async_training.use_rollout_log_probs=True diff --git a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh index dbfbee8fdfc..18888fd161c 100644 --- a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh +++ b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_12.sh @@ -16,11 +16,6 @@ CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} -MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B -CKPTS_DIR=./ckpts/${project_name}/${exp_name} -TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet -TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet - rollout_mode="async" rollout_name="vllm" # sglang or vllm if [ "$rollout_mode" = "async" ]; then @@ -79,7 +74,8 @@ train_prompt_mini_bsz=32 total_rollout_steps=$(((512*100))) test_freq=10 staleness_threshold=0.1 -trigger_parameter_sync_step=16 +trigger_parameter_sync_step=4 +require_batches=4 partial_rollout=True python -m recipe.fully_async_policy.fully_async_main \ @@ -163,4 +159,6 @@ python -m recipe.fully_async_policy.fully_async_main \ rollout.total_epochs=10 \ async_training.staleness_threshold="${staleness_threshold}" \ async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ - async_training.partial_rollout="${partial_rollout}" + 
async_training.require_batches="${require_batches}" \ + async_training.partial_rollout="${partial_rollout}" \ + async_training.use_rollout_log_probs=True \ No newline at end of file diff --git a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_4.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_4.sh index 6f64caaea0a..bd56bdd424b 100644 --- a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_4.sh +++ b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_4_4.sh @@ -16,15 +16,6 @@ CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} -MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B -CKPTS_DIR=./ckpts/${project_name}/${exp_name} -TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet -TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet - -MODEL_PATH=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/modelscope/hub/models/Qwen/Qwen2___5-Math-7B -TRAIN_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/dapo-math-17k.parquet -TEST_FILE=/mnt/dolphinfs/hdd_pool/docker/user/hadoop-djst-algoplat/houzhenggang/data/dapo/aime-2024.parquet - rollout_mode="async" rollout_name="vllm" # sglang or vllm if [ "$rollout_mode" = "async" ]; then @@ -83,7 +74,8 @@ train_prompt_mini_bsz=32 total_rollout_steps=$(((512*100))) test_freq=10 staleness_threshold=0.1 -trigger_parameter_sync_step=16 +trigger_parameter_sync_step=4 +require_batches=4 partial_rollout=True python -m recipe.fully_async_policy.fully_async_main \ @@ -167,4 +159,6 @@ python -m recipe.fully_async_policy.fully_async_main \ rollout.test_freq="${test_freq}" \ async_training.staleness_threshold="${staleness_threshold}" \ 
async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ - async_training.partial_rollout="${partial_rollout}" + async_training.require_batches="${require_batches}" \ + async_training.partial_rollout="${partial_rollout}" \ + async_training.use_rollout_log_probs=True \ No newline at end of file diff --git a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64.sh index 9e77ed3e567..c03e880eec8 100644 --- a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64.sh +++ b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_64_64.sh @@ -16,11 +16,6 @@ CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} -MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B -CKPTS_DIR=./ckpts/${project_name}/${exp_name} -TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet -TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet - rollout_mode="async" rollout_name="vllm" # sglang or vllm if [ "$rollout_mode" = "async" ]; then @@ -63,7 +58,7 @@ ref_offload=True actor_offload=False gen_tp=4 sp_size=4 -fsdp_size=2 +fsdp_size=8 # Fully async specific parameters NNODES_ROLLOUT=${NNODES_ROLLOUT:-8} @@ -75,10 +70,11 @@ gen_prompt_bsz=1 n_resp_per_prompt=16 train_prompt_mini_bsz=32 total_rollout_steps=$(((512*400))) -test_freq=10 -staleness_threshold=0 -trigger_parameter_sync_step=16 -partial_rollout=False +test_freq=20 +staleness_threshold=0.1 +trigger_parameter_sync_step=4 +require_batches=4 +partial_rollout=True python -m recipe.fully_async_policy.fully_async_main \ data.train_files="${TRAIN_FILE}" \ @@ -161,4 +157,6 @@ python -m recipe.fully_async_policy.fully_async_main \ 
rollout.test_freq="${test_freq}" \ async_training.staleness_threshold="${staleness_threshold}" \ async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ - async_training.partial_rollout="${partial_rollout}" + async_training.require_batches="${require_batches}" \ + async_training.partial_rollout="${partial_rollout}" \ + async_training.use_rollout_log_probs=True diff --git a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh index 02f7664360f..ab9c98b1f4d 100644 --- a/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh +++ b/recipe/fully_async_policy/shell/dapo_7b_math_fsdp2_8_8.sh @@ -16,11 +16,6 @@ CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} -MODEL_PATH=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/model/Qwen2___5-Math-7B -CKPTS_DIR=./ckpts/${project_name}/${exp_name} -TRAIN_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/dapo-math-17k.parquet -TEST_FILE=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-friday-studio/FTI/houzhenggang/data/dapo/aime-2024.parquet - rollout_mode="async" rollout_name="vllm" # sglang or vllm if [ "$rollout_mode" = "async" ]; then @@ -65,15 +60,11 @@ gen_tp=1 sp_size=1 fsdp_size=2 - # Fully async specific parameters NNODES_ROLLOUT=${NNODES_ROLLOUT:-1} NNODES_TRAIN=${NNODES_TRAIN:-1} NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} -n_gpus_rollout=8 -n_gpus_training=8 - train_prompt_bsz=0 gen_prompt_bsz=1 n_resp_per_prompt=16 @@ -81,7 +72,8 @@ train_prompt_mini_bsz=32 total_rollout_steps=$(((512*100))) test_freq=10 staleness_threshold=0.1 -trigger_parameter_sync_step=16 +trigger_parameter_sync_step=4 +require_batches=4 partial_rollout=True python -m recipe.fully_async_policy.fully_async_main \ @@ -165,4 +157,6 @@ python -m 
recipe.fully_async_policy.fully_async_main \ rollout.test_freq="${test_freq}" \ async_training.staleness_threshold="${staleness_threshold}" \ async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ - async_training.partial_rollout="${partial_rollout}" + async_training.require_batches="${require_batches}" \ + async_training.partial_rollout="${partial_rollout}" \ + async_training.use_rollout_log_probs=True \ No newline at end of file From 7cae5d5ed3f11ddb75b1189da7719eaa0bdedb68 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Fri, 17 Oct 2025 13:04:09 +0800 Subject: [PATCH 178/182] update readme --- docs/advance/fully_async.md | 2 +- recipe/fully_async_policy/README.md | 2 +- recipe/fully_async_policy/README_zh.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/advance/fully_async.md b/docs/advance/fully_async.md index 6dac051922c..a3ad5e5cf0c 100644 --- a/docs/advance/fully_async.md +++ b/docs/advance/fully_async.md @@ -2,7 +2,7 @@ **Author:** `https://github.com/meituan-search` -Last updated: 10/16/2025. +Last updated: 10/17/2025. This document introduces a fully asynchronous PPO training system that completely decouples the Trainer and Rollouter, supporting asynchronous sample generation and training. diff --git a/recipe/fully_async_policy/README.md b/recipe/fully_async_policy/README.md index 6dac051922c..a3ad5e5cf0c 100644 --- a/recipe/fully_async_policy/README.md +++ b/recipe/fully_async_policy/README.md @@ -2,7 +2,7 @@ **Author:** `https://github.com/meituan-search` -Last updated: 10/16/2025. +Last updated: 10/17/2025. This document introduces a fully asynchronous PPO training system that completely decouples the Trainer and Rollouter, supporting asynchronous sample generation and training. 
diff --git a/recipe/fully_async_policy/README_zh.md b/recipe/fully_async_policy/README_zh.md index ea0e8c14679..fbbed992d4d 100644 --- a/recipe/fully_async_policy/README_zh.md +++ b/recipe/fully_async_policy/README_zh.md @@ -2,7 +2,7 @@ **Author:** `https://github.com/meituan-search` -Last updated: 10/16/2025. +Last updated: 10/17/2025. 本文档介绍了完全异步PPO训练系统,该系统实现了 Trainer 和 Rollouter 的完全解耦,支持异步样本生成和训练。 在该系统下,我们使用128卡训练qwen2.5-7B模型取得了2.35x-2.67x的性能提升,同时效果没有显著受到影响。 From 62fb0d0a263b510ccd18f88b500149ba32b651c2 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Fri, 17 Oct 2025 14:10:37 +0800 Subject: [PATCH 179/182] trigger ci --- recipe/fully_async_policy/README_zh.md | 1 + 1 file changed, 1 insertion(+) diff --git a/recipe/fully_async_policy/README_zh.md b/recipe/fully_async_policy/README_zh.md index fbbed992d4d..b30738dc4a3 100644 --- a/recipe/fully_async_policy/README_zh.md +++ b/recipe/fully_async_policy/README_zh.md @@ -361,6 +361,7 @@ TODO: 30B 的实验,还在完善中。 | fully_async_policy | 64:64 | async stream pipeline with partial rollout | | | | | | | + ## 后续计划 * GRPO实验 From fbae66a5c5358400b8ab8819cbcf88d73e2ed4b8 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Fri, 17 Oct 2025 14:18:00 +0800 Subject: [PATCH 180/182] trigger ci --- recipe/fully_async_policy/README_zh.md | 1 - 1 file changed, 1 deletion(-) diff --git a/recipe/fully_async_policy/README_zh.md b/recipe/fully_async_policy/README_zh.md index b30738dc4a3..fbbed992d4d 100644 --- a/recipe/fully_async_policy/README_zh.md +++ b/recipe/fully_async_policy/README_zh.md @@ -361,7 +361,6 @@ TODO: 30B 的实验,还在完善中。 | fully_async_policy | 64:64 | async stream pipeline with partial rollout | | | | | | | - ## 后续计划 * GRPO实验 From 0565a5523019fd25dc81afa99062f7961b5e78a0 Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Fri, 17 Oct 2025 14:23:13 +0800 Subject: [PATCH 181/182] trigger ci --- recipe/fully_async_policy/README_zh.md | 1 + 1 file changed, 1 insertion(+) diff --git a/recipe/fully_async_policy/README_zh.md 
b/recipe/fully_async_policy/README_zh.md index fbbed992d4d..b30738dc4a3 100644 --- a/recipe/fully_async_policy/README_zh.md +++ b/recipe/fully_async_policy/README_zh.md @@ -361,6 +361,7 @@ TODO: 30B 的实验,还在完善中。 | fully_async_policy | 64:64 | async stream pipeline with partial rollout | | | | | | | + ## 后续计划 * GRPO实验 From dda6c5d66f8d14326e486e2ca6e63bf4bede06db Mon Sep 17 00:00:00 2001 From: ArronHZG Date: Fri, 17 Oct 2025 14:50:42 +0800 Subject: [PATCH 182/182] trigger ci --- recipe/fully_async_policy/README_zh.md | 1 - 1 file changed, 1 deletion(-) diff --git a/recipe/fully_async_policy/README_zh.md b/recipe/fully_async_policy/README_zh.md index b30738dc4a3..fbbed992d4d 100644 --- a/recipe/fully_async_policy/README_zh.md +++ b/recipe/fully_async_policy/README_zh.md @@ -361,7 +361,6 @@ TODO: 30B 的实验,还在完善中。 | fully_async_policy | 64:64 | async stream pipeline with partial rollout | | | | | | | - ## 后续计划 * GRPO实验