diff --git a/recipe/README.md b/recipe/README.md
index 7a4c4769d9a..a0bfc359222 100644
--- a/recipe/README.md
+++ b/recipe/README.md
@@ -1,6 +1,12 @@
# Recipe
-The examples under `recipes/` are representative extensions to verl for specific end-to-end RL training recipes.
-The help the community reproduce experiments, verl team provides a snapshot of the codebase when each recipe is initially PR'ed to verl main. You can find them via [github branches](https://github.com/volcengine/verl/branches/all?query=recipe)
+> Update 2025/11/25: recipes have been moved to a new repository: [verl-recipe](https://github.com/verl-project/verl-recipe).
+
+verl is designed to be a modular, extensible framework for post-training: SFT and RL. A recipe is expected to import verl as a library, with necessary extensions to build a specific RL training pipeline. If you find verl can't meet a recipe's requirements, please open an issue or PR to verl.
+
+There are still some incubation recipes kept here, which are expected to be officially supported in verl in the future.
+- fully_async_policy: fully asynchronous off-policy training with decoupled trainer and rollout.
+- transfer_queue: a high-performance asynchronous streaming data management system.
+- vla: VLA model RL training.
# Awesome work using verl
diff --git a/recipe/langgraph_agent/__init__.py b/recipe/langgraph_agent/__init__.py
deleted file mode 100644
index 1ce90c5eb35..00000000000
--- a/recipe/langgraph_agent/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright 2024 Bytedance Ltd. and/or its affiliates
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/recipe/langgraph_agent/chat_model.py b/recipe/langgraph_agent/chat_model.py
deleted file mode 100644
index 0365a56a999..00000000000
--- a/recipe/langgraph_agent/chat_model.py
+++ /dev/null
@@ -1,393 +0,0 @@
-# Copyright 2024 Bytedance Ltd. and/or its affiliates
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Ref: https://python.langchain.com/docs/how_to/custom_chat_model/
-"""
-
-import asyncio
-import json
-import logging
-import os
-import uuid
-from typing import Any, Optional
-
-from langchain_core.language_models import BaseChatModel
-from langchain_core.language_models.base import LanguageModelInput
-from langchain_core.messages import (
- AIMessage,
- BaseMessage,
- convert_to_openai_messages,
-)
-from langchain_core.messages.tool import InvalidToolCall, ToolCall
-from langchain_core.outputs import ChatGeneration, ChatResult
-from langchain_core.runnables import Runnable, RunnableConfig
-from langchain_core.tools import StructuredTool
-from langchain_core.utils.function_calling import convert_to_openai_tool
-from pydantic import Field
-
-from verl.experimental.agent_loop.agent_loop import AgentLoopOutput, AsyncLLMServerManager
-from verl.experimental.agent_loop.tool_parser import ToolParser
-from verl.experimental.agent_loop.utils import add_generation_prompt_for_gpt_oss, format_gpt_oss_tool_response_manually
-
-logger = logging.getLogger(__file__)
-logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
-
-
-class MaxTokenExceededError(Exception):
- """Indicate that history chat messages + tool message exceeds LLM max_tokens."""
-
- pass
-
-
-class ChatModel(BaseChatModel):
- model_name: str = Field(alias="model")
- """The name of the model"""
-
- client: AsyncLLMServerManager
- """AsyncLLM server manager"""
-
- tokenizer: Any
- """Tokenizer for the model"""
-
- max_tokens: int
- """Max tokens to generate"""
-
- tool_parser: str = "hermes"
- """Tool parser for the model"""
-
- max_parallel_calls: int = 1
- """Max parallel tool calls"""
-
- temperature: float = 1.0
- """Temperature for sampling"""
-
- top_p: float = 1.0
- """Top p for sampling"""
-
- repetition_penalty: float = 1.0
- """Repetition penalty for sampling"""
-
- def bind_tools(self, tools, **kwargs) -> Runnable[LanguageModelInput, BaseMessage]:
- """Bind tools to the model.
-
- Args:
- tools: Sequence of tools to bind to the model.
-
- Returns:
- A Runnable that returns a message.
- """
- formatted_tools: list = [convert_to_openai_tool(tool) for tool in tools]
-
- # used to remove system prompt prefix when encoding tool response
- system_prompt = self.tokenizer.apply_chat_template([{}], add_generation_prompt=False, tokenize=True)
- kwargs["system_prompt"] = system_prompt
-
- return self.bind(tools=formatted_tools, **kwargs)
-
- def with_structured_output(
- self,
- schema: dict | type,
- *,
- include_raw: bool = False,
- **kwargs: Any,
- ) -> Runnable[LanguageModelInput, dict | BaseChatModel]:
- """Ref: https://langchain-ai.github.io/langgraph/how-tos/react-agent-structured-output/"""
- raise NotImplementedError
-
- def _generate(
- self,
- messages: list[BaseMessage],
- stop: Optional[list[str]] = None,
- **kwargs: Any,
- ) -> ChatResult:
- raise NotImplementedError
-
- async def _agenerate(
- self,
- messages: list[BaseMessage],
- stop: Optional[list[str]] = None,
- **kwargs: Any,
- ) -> ChatResult:
- """Asynchronously generate chat completion message.
-
- Args:
- messages (list[BaseMessage]): List of list of messages.
- stop (Optional[list[str]], optional): Stop words to use when generating. Model output is cut off at the
- first occurrence of any of these substrings. Defaults to None.
-
- Returns:
- ChatResult: Chat result.
- """
- request_id, prompt_ids, response_mask = await self._preprocess(messages, **kwargs)
-
- sampling_params = {
- "temperature": self.temperature,
- "top_p": self.top_p,
- "repetition_penalty": self.repetition_penalty,
- }
- if "sampling_params" in kwargs:
- sampling_params.update(kwargs["sampling_params"])
-
- output = await self.client.generate(
- request_id=request_id, prompt_ids=prompt_ids, sampling_params=sampling_params
- )
-
- message = await self._postprocess(request_id, prompt_ids, response_mask, output.token_ids, **kwargs)
- generation = ChatGeneration(message=message)
- return ChatResult(generations=[generation])
-
- @property
- def _llm_type(self) -> str:
- """Get the type of language model used by this chat model."""
- return self.model_name
-
- async def _preprocess(self, messages: list[BaseMessage], **kwargs: Any) -> tuple[str, list[int], list[int]]:
- """Preprocess messages for chat completion.
-
- To ensure strong consistency with policy model, AsyncLLM server generate response with token in token out
- instead of messages list.
-
- But all agent frameworks use messages list to represent chat history. To mitigate the gap, we store trajectory
- (prompt_ids, response_mask) in lastest AIMessage.response_metadata.
-
- 1. Encode ToolMessage to token ids.
- 2. Retrieve trajectory (prompt_ids, response_mask) from lastest AIMessage.response_metadata.
- 3. Append ToolMessage token ids to prompt_ids, and append 0 to response_mask.
-
- Ref: https://python.langchain.com/docs/concepts/chat_history/
-
- Args:
- messages (list[BaseMessage]): List of messages.
-
- Returns:
- tuple[str, list[int], list[int]]: Request id, prompt ids, response mask.
- """
- # messages: [system], human, ai, human|tool, ai, human|tool, ...
- assert messages[-1].type in ["human", "tool"], (
- f"Last message must be human or tool, but got {messages[-1].type}"
- )
- loop = asyncio.get_running_loop()
-
- # Case 1: initial chat completion: [system], human
- if messages[-1].type == "human" and (len(messages) == 1 or messages[-2].type != "ai"):
- prompt_ids = await loop.run_in_executor(
- None,
- lambda: self.tokenizer.apply_chat_template(
- convert_to_openai_messages(messages),
- tools=kwargs.get("tools"),
- add_generation_prompt=True,
- tokenize=True,
- ),
- )
- return str(uuid.uuid4()), prompt_ids, []
-
- # Case 2: follow up chat completion with tool/human response: [system], human, ai, human|tool, ...
- for i in range(len(messages) - 1, -1, -1):
- if messages[i].type == "ai":
- break
- assert "prompt_ids" in messages[i].response_metadata, "Last message must have prompt_ids in response_metadata"
- assert "response_mask" in messages[i].response_metadata, (
- "Last message must have response_mask in response_metadata"
- )
-
- # encode tool response
- tool_responses = convert_to_openai_messages(messages[i + 1 :])
- if self.tool_parser == "hermes":
- tool_response_ids = await loop.run_in_executor(
- None,
- lambda messages=tool_responses: self.tokenizer.apply_chat_template(
- messages, add_generation_prompt=True, tokenize=True
- ),
- )
- tool_response_ids = tool_response_ids[len(kwargs["system_prompt"]) :]
- elif self.tool_parser == "gpt-oss":
- # Format tool responses manually
- # since gpt-oss chat template requires tool call messages to parse tool response messages
- # we need to format the tool response messages manually
- tool_response_texts = []
- for tool_msg in tool_responses:
- if tool_msg["role"] == "tool":
- # Use tool message's name if available (for multiple tool calls)
- actual_tool_name = tool_msg.get("name", "unknown")
- if actual_tool_name == "unknown":
- logger.error(f"actual_tool_name: {actual_tool_name}")
- formatted = format_gpt_oss_tool_response_manually(tool_msg["content"], actual_tool_name)
- tool_response_texts.append(formatted)
-
- # Tokenize the manually formatted tool responses
- tool_response_text = "".join(tool_response_texts)
- # need to add generation tokens for gpt-oss manually since add_generation_prompt is True
- tool_response_text = add_generation_prompt_for_gpt_oss(tool_response_text)
- logger.debug(f"tool_response_text: {tool_response_text}")
-
- tool_response_ids = await loop.run_in_executor(
- None, lambda: self.tokenizer.encode(tool_response_text, add_special_tokens=False)
- )
- else:
- raise ValueError(f"Unsupported tool parser: {self.tool_parser}")
-
- # stop generation if response length exceeds max response length
- if len(messages[i].response_metadata["response_mask"]) + len(tool_response_ids) >= self.max_tokens:
- raise MaxTokenExceededError(f"Max response length {self.max_tokens} exceeded")
-
- # append tool response to prompt
- request_id = messages[i].response_metadata.pop("request_id")
- prompt_ids = messages[i].response_metadata.pop("prompt_ids")
- response_mask = messages[i].response_metadata.pop("response_mask")
- prompt_ids += tool_response_ids
- response_mask += [0] * len(tool_response_ids)
-
- return request_id, prompt_ids, response_mask
-
- async def _postprocess(
- self, request_id: str, prompt_ids: list[int], response_mask: list[int], response_ids: list[int], **kwargs: Any
- ) -> AIMessage:
- """Postprocess response_ids when chat completion is done.
-
- 1. Decode response_ids, parse tool calls to AIMessage.
- 2. Append response_ids to prompt_ids, and append 1 to response_mask.
- 3. Store trajectory (prompt_ids, response_mask) in AIMessage.response_metadata.
-
- Args:
- request_id (str): Unique request id.
- prompt_ids (list[int]): Input prompt token ids in this chat completion.
- response_mask (list[int]): Response mask before this chat completion.
- response_ids (list[int]): LLM generated token ids in this chat completion.
-
- Returns:
- AIMessage: Postprocessed message.
- """
- prompt_ids += response_ids
- response_mask += [1] * len(response_ids)
-
- tool_parser = ToolParser.get_tool_parser(self.tool_parser, self.tokenizer)
- content, function_calls = await tool_parser.extract_tool_calls(response_ids)
-
- tool_calls, invalid_tool_calls = [], []
-
- for function_call in function_calls:
- error = None
- try:
- args = json.loads(function_call.arguments)
- if not isinstance(args, dict):
- error = f"Tool arguments must be a JSON object, got {type(args).__name__}"
- except json.JSONDecodeError as e:
- error = f"Invalid JSON tool arguments: {e}"
-
- if error:
- logger.warning(error)
- invalid_tool_calls.append(
- InvalidToolCall(
- name=function_call.name,
- args=function_call.arguments,
- id=str(uuid.uuid4()),
- error=error,
- )
- )
- else:
- tool_calls.append(
- ToolCall(
- name=function_call.name,
- args=args,
- id=str(uuid.uuid4()),
- )
- )
-
- message = AIMessage(
- content=content,
- tool_calls=tool_calls[: self.max_parallel_calls],
- invalid_tool_calls=invalid_tool_calls[: self.max_parallel_calls],
- response_metadata={
- "request_id": request_id,
- "prompt_ids": prompt_ids,
- "response_mask": response_mask,
- },
- )
- return message
-
-
-class TruncateStructuredTool(StructuredTool):
- """Structured tool with response truncation."""
-
- tool_response_truncate_side: str
- """truncate side of tool response: left, middle, right"""
-
- max_tool_response_length: int
- """max length of tool response"""
-
- async def _arun(
- self,
- *args: Any,
- config: RunnableConfig,
- **kwargs: Any,
- ) -> Any:
- tool_response = await super()._arun(*args, config=config, **kwargs)
- tool_response = str(tool_response)
-
- if len(tool_response) > self.max_tool_response_length:
- if self.tool_response_truncate_side == "left":
- tool_response = tool_response[: self.max_tool_response_length] + "...(truncated)"
- elif self.tool_response_truncate_side == "right":
- tool_response = "(truncated)..." + tool_response[-self.max_tool_response_length :]
- else:
- length = self.max_tool_response_length // 2
- tool_response = tool_response[:length] + "...(truncated)..." + tool_response[-length:]
-
- return tool_response
-
-
-def convert_to_agent_output(messages: list[BaseMessage], response_length: int) -> AgentLoopOutput:
- """Convert messages to AgentLoopOutput.
-
- Args:
- messages (List[BaseMessage]): List of messages, last message must be assistant
- with response_metadata containing `prompt_ids` and `response_mask`.
- response_length (int): Max length of response.
-
- Returns:
- AgentLoopOutput: agent loop output trajectory used for training.
- """
- # skip last tool calls
- for i in range(len(messages) - 1, -1, -1):
- if messages[i].type != "tool":
- break
- last_message = messages[i]
- assert last_message.type == "ai", f"Last message must be assistant, but got {last_message.type}"
- assert "prompt_ids" in last_message.response_metadata, "Last message must have prompt_ids in response_metadata"
- assert "response_mask" in last_message.response_metadata, (
- "Last message must have response_mask in response_metadata"
- )
-
- num_turns = 0
- for i in range(len(messages)):
- if messages[i].type == "system":
- continue
- # parallel tool calls are in single turn
- if i == 0 or messages[i].type != messages[i - 1].type:
- num_turns += 1
-
- prompt_ids = last_message.response_metadata["prompt_ids"]
- response_mask = last_message.response_metadata["response_mask"]
-
- response_ids = prompt_ids[-len(response_mask) :]
- prompt_ids = prompt_ids[: len(prompt_ids) - len(response_mask)]
-
- output = AgentLoopOutput(
- prompt_ids=prompt_ids,
- response_ids=response_ids[:response_length],
- response_mask=response_mask[:response_length],
- num_turns=num_turns,
- metrics={},
- )
- return output
diff --git a/recipe/langgraph_agent/example/README.md b/recipe/langgraph_agent/example/README.md
deleted file mode 100644
index 4540c51b4e9..00000000000
--- a/recipe/langgraph_agent/example/README.md
+++ /dev/null
@@ -1,138 +0,0 @@
-# MathExpression: LangGraph Agent Example
-
-MathExpression is a tiny example to demonstrate multi-turn rollout with [LangGraph ReactAgent](https://langchain-ai.github.io/langgraph/agents/overview/).
-
-### Define react agent with tool
-Firstly, to force ReactAgent to evaluate math expression by tool, we define a special operand `@`:
-```python
-@tool(parse_docstring=True)
-def calculate(a: int, b: int, operand: str) -> int:
- """
- Compute the results using operand with two integers
-
- Args:
- a: the first operand
- b: the second operand
- operand: '+' or '-' or '*' or '@'
- """
- assert operand in ["+", "-", "*", "@"], f"unknown operand {operand}"
- if operand == "@":
- return 3 * a - 2 * b
- return eval(f"{a} {operand} {b}")
-```
-
-Without calling `calculate`, ReactAgent is impossible to evaluate math expression correctly.
-
-Then, we can equip ReactAgent with `calculate` tool:
-```python
-class MathExpressionReactAgentLoop(ReactAgentLoop):
- @classmethod
- def init_class(cls, config, tokenizer):
- cls.tools = [calculate]
- super().init_class(config, tokenizer)
-```
-
-We can define agent loop config in yaml file, which will be used by AgentLoopWorker to dynamic load custom AgentLoop class.
-```yaml
-- name: math_expression
- _target_: recipe.langgraph_agent.example.math_expression.MathExpressionReactAgentLoop
-```
-
-### Prepare dataset
-Now, let's prepare two small datasets for training and evaluation:
-```bash
-python recipe/langgraph_agent/example/create_dataset.py
-```
-
-- Parameters: `--train_size` (default: 5000), `--test_size` (default: 500), `--output_dir` (default: `data/math_expression_tool`).
-- Example with custom sizes/output:
-```bash
-python recipe/langgraph_agent/example/create_dataset.py \
- --train_size 10000 \
- --test_size 1000 \
- --output_dir data/math_expression_tool
-```
-
-Note that dataset should contain a column `agent_name` with `math_expression`, which is used by `AgentLoopWorker` to select the
-agent loop class.
-| prompt | reward_model | agent_name |
-|--------------------------------------|------------------------------|-----------------|
-| [{'role': 'user', 'content': '...'}] | {'ground_truth': '-10', ...} | math_expression |
-| [{'role': 'user', 'content': '...'}] | {'ground_truth': '-10', ...} | math_expression |
-
-Generated math expressions are like below, requiring model to call `calculate` multiple times to solve sub expressions.
-```
-(2 @ (8 @ 8 @ 5 @ 5 @ 3) @ 6 @ (1 @ 4 @ 4 @ 4) @ 2) @ 6
-(4.6 @ (9.05 @ 4.0) @ 8.3 @ 1.21) @ 8.6
-9 @ 4
-((2 @ 2) @ (3 @ 3)) @ 4
-```
-
-### Training
-Hook all these up and start training:
-```bash
-bash recipe/langgraph_agent/example/run_qwen2.5_3b.sh 2>&1 | tee train.log
-```
-
-To submit on a SLURM cluster (the script contains SBATCH headers):
-```bash
-sbatch recipe/langgraph_agent/example/run_qwen2.5_3b.sh
-```
-
-**Note on `GPUS_PER_NODE` and `NNODES`:**
-
-- `GPUS_PER_NODE`: GPUs per node.
- Detection order: `SLURM_GPUS_ON_NODE` (if set) → `GPUS_PER_NODE` → `2`.
-- `NNODES`: number of nodes.
- Detection order: `SLURM_JOB_NUM_NODES` (if set) → `NNODES` → `1`.
-- Total GPUs = `GPUS_PER_NODE × NNODES` (must be ≥ 2).
-
-Local override (no `SLURM_*` set):
-```bash
-GPUS_PER_NODE=4 NNODES=2 bash recipe/langgraph_agent/example/run_qwen2.5_3b.sh
-```
-
-After total 39 steps, model should achieve 100% accuray on test dataset:
-- val-aux/lighteval/MATH/reward: 1.0
-- val-aux/num_turns/mean: 9.0, average number of messages include assistant and tool turns.
-
-We can see model can solve math expression by calling `calculate` tool:
-```
-system
-You are Qwen, created by Alibaba Cloud. You are a helpful assistant.
-user
-We define a new math operator @, where you can only call an external tool to compute. Please put your final answer inside \boxed{} only in the last turn. Now answer the following questions:
-Compute 5 @ (3 @ 6 @ 8)
-assistant
-Sure, let's break this down step by step. We need to compute the expression \(5 @ (3 @ 6 @ 8)\). First, we need to compute the inner expression \(3 @ 6 @ 8\).
-
-Let's compute \(3 @ 6\) first using the external tool.
-
-{"name": "calculate", "arguments": {"a": 3, "b": 6, "operand": "@"}}
-user
-
--3
-
-assistant
-We have computed that \(3 @ 6 = -3\). Now we need to compute \(-3 @ 8\).
-
-Let's use the external tool to compute this.
-
-{"name": "calculate", "arguments": {"a": -3, "b": 8, "operand": "@"}}
-user
-
--25
-
-assistant
-We have computed that \(-3 @ 8 = -25\). Now we can put this result into the original expression to get the final result of \(5 @ (3 @ 6 @ 8)\), which is \(5 @ (-25)\).
-
-Let's compute \(5 @ (-25)\) using the external tool.
-
-{"name": "calculate", "arguments": {"a": 5, "b": -25, "operand": "@"}}
-user
-
-65
-
-assistant
-The final result of \(5 @ (3 @ 6 @ 8)\) is \(\boxed{65}\).
-```
diff --git a/recipe/langgraph_agent/example/agent.yaml b/recipe/langgraph_agent/example/agent.yaml
deleted file mode 100644
index cbd8fb9ebb9..00000000000
--- a/recipe/langgraph_agent/example/agent.yaml
+++ /dev/null
@@ -1,2 +0,0 @@
-- name: math_expression
- _target_: recipe.langgraph_agent.example.math_expression.MathExpressionReactAgentLoop
diff --git a/recipe/langgraph_agent/example/create_dataset.py b/recipe/langgraph_agent/example/create_dataset.py
deleted file mode 100644
index 45ce131f83f..00000000000
--- a/recipe/langgraph_agent/example/create_dataset.py
+++ /dev/null
@@ -1,290 +0,0 @@
-# Copyright 2024 Bytedance Ltd. and/or its affiliates
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Create dataset for calculator
-"""
-
-import argparse
-import os
-import random
-
-import pandas as pd
-
-
-def generate_math_expression(min_terms=2, max_terms=5, min_number=1, max_number=10, allow_decimals=False, max_depth=2):
- """
- Generate a random mathematical expression with operators +, -, *, /, and parentheses.
-
- Args:
- min_terms (int): Minimum number of terms in the expression.
- max_terms (int): Maximum number of terms in the expression.
- max_number (int): Maximum value for numbers in the expression.
- allow_decimals (bool): Whether to allow decimal numbers.
- max_depth (int): Maximum nesting depth for parentheses.
-
- Returns:
- str: A valid mathematical expression as a string.
- """
-
- def generate_number():
- """Generate a random number (integer or float)."""
- assert min_number < max_number
- num = random.uniform(min_number, max_number)
- if not allow_decimals:
- num = int(num)
- else:
- num = round(num, random.randint(0, 2)) # Round to 0-2 decimal places
- return str(num)
-
- def generate_term(depth=0):
- """Generate a term (number or parenthesized expression)."""
- if depth < max_depth and random.random() < 0.5: # 50% chance to add parentheses
- expr = generate_expression(depth + 1)
- return f"({expr})"
- else:
- return generate_number()
-
- def generate_expression(depth=0):
- """Generate a full expression with multiple terms and operators."""
- num_terms = random.randint(min_terms, max_terms)
- terms = [generate_term(depth) for _ in range(num_terms)]
-
- # Randomly select operators
- operators = ["+", "-", "*", "/", "@"]
- expr = terms[0]
-
- for i in range(1, num_terms):
- # Bias towards + and - for readability
- op = random.choices(
- operators,
- weights=[0, 0, 0, 0, 1], # + and - are 1.5x more likely than * and /
- )[0]
- expr += f" {op} " + terms[i]
-
- return expr
-
- return generate_expression()
-
-
-def test():
- # Example 1: Basic integer expression
- print(generate_math_expression())
- # Output: (3 + 7) * 2 - 5
-
- # Example 2: Expression with decimals
- print(generate_math_expression(allow_decimals=True))
- # Output: 4.5 / (2.1 + 3.7) - 1.2
-
- # Example 3: More complex expression with higher depth
- print(generate_math_expression(max_terms=6, max_depth=3))
- # Output: ((5 * 2) - (3 + 1)) / (7 - 2) + 4
-
- # Example 4: Simplified expression
- print(generate_math_expression(min_terms=2, max_terms=3, max_number=5))
- # Output: 4 - 2 * 3
-
-
-def calculate(expression: str) -> float:
- """
- Evaluate a mathematical expression with +, -, *, /, @, and parentheses.
- The @ operator is defined as: a @ b = 3a - 2b.
-
- Args:
- expression (str): Input mathematical expression (e.g., "3@2+4").
-
- Returns:
- float: Result of the evaluated expression.
-
- Raises:
- ValueError: For invalid expressions (e.g., mismatched parentheses, division by zero).
- """
-
- def tokenize(s: str) -> list:
- """Convert the input string into tokens (numbers, operators, parentheses)."""
- tokens = []
- i = 0
- while i < len(s):
- if s[i].isdigit() or s[i] == ".":
- # Parse number (integer or float)
- j = i
- while j < len(s) and (s[j].isdigit() or s[j] == "."):
- j += 1
- tokens.append(s[i:j])
- i = j
- elif s[i] in "+-*/@()":
- # Operator or parenthesis
- tokens.append(s[i])
- i += 1
- elif s[i].isspace():
- # Skip whitespace
- i += 1
- else:
- raise ValueError(f"Invalid character: {s[i]}")
- return tokens
-
- def infix_to_postfix(tokens: list) -> list:
- """Convert infix notation to postfix notation (Reverse Polish Notation)."""
- output = []
- stack = []
- # Higher precedence for @ (between * and +)
- precedence = {"@": 3, "*": 2, "/": 2, "+": 1, "-": 1}
-
- for token in tokens:
- if token.isdigit() or "." in token:
- output.append(token)
- elif token == "(":
- stack.append(token)
- elif token == ")":
- while stack and stack[-1] != "(":
- output.append(stack.pop())
- if not stack or stack[-1] != "(":
- raise ValueError("Mismatched parentheses")
- stack.pop() # Discard '('
- else: # Operator
- while stack and stack[-1] != "(" and precedence.get(stack[-1], 0) >= precedence.get(token, 0):
- output.append(stack.pop())
- stack.append(token)
-
- # Pop remaining operators
- while stack:
- if stack[-1] in "()":
- raise ValueError("Mismatched parentheses")
- output.append(stack.pop())
-
- return output
-
- def evaluate_postfix(postfix: list) -> float:
- """Evaluate postfix expression using a stack."""
- stack = []
- for token in postfix:
- if token.isdigit() or "." in token:
- stack.append(float(token))
- else:
- if len(stack) < 2:
- raise ValueError("Invalid expression")
- b = stack.pop()
- a = stack.pop()
- if token == "+":
- res = a + b
- elif token == "-":
- res = a - b
- elif token == "*":
- res = a * b
- elif token == "/":
- if b == 0:
- raise ValueError("Division by zero")
- res = a / b
- elif token == "@":
- res = 3 * a - 2 * b # Custom @ operator implementation
- else:
- raise ValueError(f"Invalid operator: {token}")
- stack.append(res)
-
- if len(stack) != 1:
- raise ValueError("Invalid expression")
- return stack[0]
-
- # Remove spaces and validate parentheses
- expression = expression.replace(" ", "")
- if expression.count("(") != expression.count(")"):
- raise ValueError("Mismatched parentheses")
-
- tokens = tokenize(expression)
- postfix = infix_to_postfix(tokens)
- result = evaluate_postfix(postfix)
-
- # Convert integers to integer representation
- if result.is_integer():
- return int(result)
- return result
-
-
-def generate_data(total_num_dataset, split, agent_name="math_expression"):
- rl_dataset = {
- "prompt": [],
- "data_source": [],
- "ability": [],
- "reward_model": [],
- "extra_info": [],
- "agent_name": [],
- }
-
- for idx in range(total_num_dataset):
- while True:
- try:
- expression: str = generate_math_expression(
- min_terms=2, max_terms=3, min_number=1, max_number=10, allow_decimals=False, max_depth=1
- )
-
- num_plus = expression.count("+")
- num_minus = expression.count("-")
- num_mul = expression.count("*")
- num_star = expression.count("@")
-
- answer = str(calculate(expression))
- # answer = str(eval(expression))
- break
- except Exception as e:
- print(e)
- continue
-
- num_tool_calls = num_plus + num_minus + num_mul + num_star
-
- prompt = (
- f"We define a new math operator @, where you can only call an external tool to compute. "
- f"Please put your final answer inside \\boxed{{}} only in the last turn. Now answer the "
- f"following questions:\nCompute {expression}"
- )
- prompt_with_template = [
- {
- "role": "user",
- "content": prompt,
- }
- ]
-
- rl_dataset["prompt"].append(prompt_with_template)
- rl_dataset["data_source"].append("lighteval/MATH")
- rl_dataset["ability"].append("math")
- rl_dataset["reward_model"].append({"style": "lighteval/MATH", "ground_truth": answer})
- rl_dataset["extra_info"].append(
- {"index": idx, "expression": expression, "split": split, "expected_tool_calls": num_tool_calls}
- )
- rl_dataset["agent_name"].append(agent_name)
-
- rl_dataset = pd.DataFrame(data=rl_dataset)
- return rl_dataset
-
-
-if __name__ == "__main__":
- parser = argparse.ArgumentParser(description="Math Expression Dataset Generator")
- parser.add_argument("--train_size", type=int, default=5000, help="Number of training samples")
- parser.add_argument("--test_size", type=int, default=500, help="Number of testing samples")
- parser.add_argument("--output_dir", default="data/math_expression_tool", help="Directory to save the dataset")
- parser.add_argument("--agent_name", default="math_expression", help="Name of the agent")
- args = parser.parse_args()
-
- # print(calculate("3@2")) # Output: 5 (3*3 - 2*2)
- # print(calculate("3@2+4")) # Output: 9 (5 + 4)
- # print(calculate("3*(4@2)")) # Output: 24 (3 * 8)
- # print(calculate("(5@3)*2")) # Output: 18 (9 * 2)
-
- train_dataset = generate_data(total_num_dataset=args.train_size, split="train", agent_name=args.agent_name)
- test_dataset = generate_data(total_num_dataset=args.test_size, split="test", agent_name=args.agent_name)
-
- # Make sure the dataset directory exists
- os.makedirs(args.output_dir, exist_ok=True)
-
- # Save the datasets to parquet files
- train_dataset.to_parquet(os.path.join(args.output_dir, "train.parquet"))
- test_dataset.to_parquet(os.path.join(args.output_dir, "test.parquet"))
diff --git a/recipe/langgraph_agent/example/math_expression.py b/recipe/langgraph_agent/example/math_expression.py
deleted file mode 100644
index 4532c8af3c4..00000000000
--- a/recipe/langgraph_agent/example/math_expression.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright 2024 Bytedance Ltd. and/or its affiliates
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from langchain_core.tools import tool
-
-from recipe.langgraph_agent.react_agent_loop import ReactAgentLoop
-
-
-@tool(parse_docstring=True)
-def calculate(a: int, b: int, operand: str) -> int:
- """
- Compute the results using operand with two integers
-
- Args:
- a: the first operand
- b: the second operand
- operand: '+' or '-' or '*' or '@'
- """
- assert operand in ["+", "-", "*", "@"], f"unknown operand {operand}"
- if operand == "@":
- return 3 * a - 2 * b
- return eval(f"{a} {operand} {b}")
-
-
-class MathExpressionReactAgentLoop(ReactAgentLoop):
- @classmethod
- def init_class(cls, config, tokenizer, **kwargs):
- cls.tools = [calculate]
- super().init_class(config, tokenizer)
diff --git a/recipe/langgraph_agent/example/run_gpt_oss_20b_bf16.sh b/recipe/langgraph_agent/example/run_gpt_oss_20b_bf16.sh
deleted file mode 100644
index 9abd7b0105f..00000000000
--- a/recipe/langgraph_agent/example/run_gpt_oss_20b_bf16.sh
+++ /dev/null
@@ -1,143 +0,0 @@
-#!/usr/bin/env bash
-#SBATCH --job-name=rl-langgraph-3B
-#SBATCH --partition=main
-#SBATCH --nodes=1
-#SBATCH --ntasks-per-node=1
-#SBATCH --cpus-per-task=64
-#SBATCH --gres=gpu:4
-#SBATCH --mem=0
-#SBATCH --time=10:00:00
-#SBATCH --output=%x_%j.out
-#SBATCH --error=%x_%j.err
-
-set -xeuo pipefail
-
-# ================= cluster topology =================
-export GPUS_PER_NODE=${SLURM_GPUS_ON_NODE:-${GPUS_PER_NODE:-2}} # GPUs on this node
-NNODES=${SLURM_JOB_NUM_NODES:-${NNODES:-1}}
-export NNODES
-export RAY_NUM_NODES=$NNODES
-
-# Require at least 2 GPUs
-TOTAL_GPUS=$((GPUS_PER_NODE * NNODES))
-if [ "$TOTAL_GPUS" -lt 2 ]; then
- echo "Error: at least 2 GPUs are required, detected $TOTAL_GPUS." >&2
- exit 1
-fi
-
-echo "Using $NNODES nodes and $GPUS_PER_NODE GPUs per node..."
-
-# ================= data/model/tool =================
-HDFS_ROOT=${HDFS_ROOT:-$PWD}
-DATA_ROOT=${DATA_ROOT:-$PWD}
-
-# Prefer local model if present, otherwise fall back to HF hub path
-model_path="lmsys/gpt-oss-20b-bf16"
-
-# Use the default output directory produced by create_dataset.py
-train_files=$DATA_ROOT/data/math_expression_tool/train.parquet
-test_files=$DATA_ROOT/data/math_expression_tool/test.parquet
-
-# Agent config
-agent_loop_config_path=recipe/langgraph_agent/example/agent.yaml
-
-# =================== wandb ===================
-project_name=math_expression_tool
-experiment_name=gpt-oss-20b-bf16
-default_local_dir=$DATA_ROOT/checkpoint/$experiment_name
-
-# ================= algorithm =================
-adv_estimator=grpo
-
-use_kl_in_reward=false
-kl_coef=0.0
-use_kl_loss=false
-kl_loss_coef=0.0
-
-clip_ratio_low=0.2
-clip_ratio_high=0.28
-
-max_turns=8
-max_prompt_length=1024
-max_response_length=8192
-actor_lr=1e-6
-
-train_batch_size=128
-ppo_mini_batch_size=16
-n_resp_per_prompt=8
-n_resp_per_prompt_val=1
-
-# =================== logging ===================
-export RAY_LOGGING_LEVEL=DEBUG
-export HYDRA_FULL_ERROR=1
-
-# ================= performance =================
-export NCCL_IBEXT_DISABLE=1
-export NCCL_NVLS_ENABLE=1
-export NCCL_IB_HCA=mlx5
-export UCX_NET_DEVICES=mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1
-export VLLM_USE_V1=1
-export VLLM_ATTENTION_BACKEND=FLASH_ATTN
-
-infer_tp=2 # vLLM tensor parallel size
-train_sp=4 # Ulysses sequence parallel size for actor
-offload=true
-
-actor_max_token_len_per_gpu=$(( (max_prompt_length + max_response_length) * 4 ))
-log_prob_max_token_len_per_gpu=$(( actor_max_token_len_per_gpu * 2 ))
-
-train_files="['$train_files']"
-test_files="['$test_files']"
-
-python3 -m verl.trainer.main_ppo \
- algorithm.adv_estimator=$adv_estimator \
- algorithm.use_kl_in_reward=$use_kl_in_reward \
- algorithm.kl_ctrl.kl_coef=$kl_coef \
- data.train_files="$train_files" \
- data.val_files="$test_files" \
- data.return_raw_chat=true \
- data.train_batch_size=$train_batch_size \
- data.max_prompt_length=$max_prompt_length \
- data.max_response_length=$max_response_length \
- data.filter_overlong_prompts=true \
- data.truncation='error' \
- actor_rollout_ref.model.path="$model_path" \
- actor_rollout_ref.model.use_remove_padding=true \
- actor_rollout_ref.model.enable_gradient_checkpointing=true \
- actor_rollout_ref.actor.use_kl_loss=$use_kl_loss \
- actor_rollout_ref.actor.kl_loss_coef=$kl_loss_coef \
- actor_rollout_ref.actor.clip_ratio_low=$clip_ratio_low \
- actor_rollout_ref.actor.clip_ratio_high=$clip_ratio_high \
- actor_rollout_ref.actor.clip_ratio_c=10.0 \
- actor_rollout_ref.actor.optim.lr=$actor_lr \
- actor_rollout_ref.actor.use_dynamic_bsz=true \
- actor_rollout_ref.actor.ppo_mini_batch_size=$ppo_mini_batch_size \
- actor_rollout_ref.actor.ppo_max_token_len_per_gpu=$actor_max_token_len_per_gpu \
- actor_rollout_ref.actor.ulysses_sequence_parallel_size=$train_sp \
- actor_rollout_ref.actor.fsdp_config.param_offload=$offload \
- actor_rollout_ref.actor.fsdp_config.optimizer_offload=$offload \
- actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=$log_prob_max_token_len_per_gpu \
- actor_rollout_ref.rollout.name=sglang \
- actor_rollout_ref.rollout.mode=async \
- actor_rollout_ref.rollout.tensor_model_parallel_size=$infer_tp \
- actor_rollout_ref.rollout.multi_turn.max_user_turns=$max_turns \
- actor_rollout_ref.rollout.multi_turn.max_assistant_turns=$max_turns \
- actor_rollout_ref.rollout.multi_turn.format=gpt-oss \
- +actor_rollout_ref.rollout.engine_kwargs.sglang.attention_backend=triton \
- actor_rollout_ref.rollout.agent.agent_loop_config_path=$agent_loop_config_path \
- actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
- actor_rollout_ref.rollout.n=$n_resp_per_prompt \
- actor_rollout_ref.rollout.val_kwargs.top_p=1.0\
- actor_rollout_ref.rollout.val_kwargs.temperature=1.0 \
- actor_rollout_ref.rollout.val_kwargs.n=$n_resp_per_prompt_val \
- trainer.logger='["console","wandb"]' \
- trainer.project_name=$project_name \
- trainer.experiment_name=$experiment_name \
- trainer.n_gpus_per_node="$GPUS_PER_NODE" \
- trainer.val_before_train=true \
- trainer.log_val_generations=50 \
- trainer.nnodes="$NNODES" \
- trainer.save_freq=-1 \
- trainer.default_local_dir="$default_local_dir" \
- trainer.test_freq=5 \
- trainer.total_epochs=1 "$@"
\ No newline at end of file
diff --git a/recipe/langgraph_agent/example/run_qwen2.5_3b.sh b/recipe/langgraph_agent/example/run_qwen2.5_3b.sh
deleted file mode 100644
index 4e4cc020ae0..00000000000
--- a/recipe/langgraph_agent/example/run_qwen2.5_3b.sh
+++ /dev/null
@@ -1,145 +0,0 @@
-#!/usr/bin/env bash
-#SBATCH --job-name=rl-langgraph-3B
-#SBATCH --partition=main
-#SBATCH --nodes=1
-#SBATCH --ntasks-per-node=1
-#SBATCH --cpus-per-task=64
-#SBATCH --gres=gpu:4
-#SBATCH --mem=0
-#SBATCH --time=10:00:00
-#SBATCH --output=%x_%j.out
-#SBATCH --error=%x_%j.err
-
-set -xeuo pipefail
-
-# ================= cluster topology =================
-export GPUS_PER_NODE=${SLURM_GPUS_ON_NODE:-${GPUS_PER_NODE:-2}} # GPUs on this node
-NNODES=${SLURM_JOB_NUM_NODES:-${NNODES:-1}}
-export NNODES
-export RAY_NUM_NODES=$NNODES
-
-# Require at least 2 GPUs
-TOTAL_GPUS=$((GPUS_PER_NODE * NNODES))
-if [ "$TOTAL_GPUS" -lt 2 ]; then
- echo "Error: at least 2 GPUs are required, detected $TOTAL_GPUS." >&2
- exit 1
-fi
-
-echo "Using $NNODES nodes and $GPUS_PER_NODE GPUs per node..."
-
-# ================= data/model/tool =================
-HDFS_ROOT=${HDFS_ROOT:-$PWD}
-DATA_ROOT=${DATA_ROOT:-$PWD}
-
-# Prefer local model if present, otherwise fall back to HF hub path
-model_path=${model_path:-$DATA_ROOT/model/Qwen2.5-3B-Instruct}
-if [ ! -d "$model_path" ]; then
- model_path=Qwen/Qwen2.5-3B-Instruct
-fi
-
-# Use the default output directory produced by create_dataset.py
-train_files=$DATA_ROOT/data/math_expression_tool/train.parquet
-test_files=$DATA_ROOT/data/math_expression_tool/test.parquet
-
-# Agent config
-agent_loop_config_path=recipe/langgraph_agent/example/agent.yaml
-
-# =================== wandb ===================
-project_name=math_expression_tool
-experiment_name=qwen2.5-3b
-default_local_dir=$DATA_ROOT/checkpoint/$experiment_name
-
-# ================= algorithm =================
-adv_estimator=grpo
-
-use_kl_in_reward=false
-kl_coef=0.0
-use_kl_loss=false
-kl_loss_coef=0.0
-
-clip_ratio_low=0.2
-clip_ratio_high=0.28
-
-max_turns=8
-max_prompt_length=1024
-max_response_length=2048
-actor_lr=1e-6
-
-train_batch_size=128
-ppo_mini_batch_size=16
-n_resp_per_prompt=8
-n_resp_per_prompt_val=1
-
-# =================== logging ===================
-export RAY_LOGGING_LEVEL=DEBUG
-export HYDRA_FULL_ERROR=1
-
-# ================= performance =================
-export NCCL_IBEXT_DISABLE=1
-export NCCL_NVLS_ENABLE=1
-export NCCL_IB_HCA=mlx5
-export UCX_NET_DEVICES=mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1
-export VLLM_USE_V1=1
-export VLLM_ATTENTION_BACKEND=FLASH_ATTN
-
-infer_tp=2 # vLLM tensor parallel size
-train_sp=4 # Ulysses sequence parallel size for actor
-offload=true
-
-actor_max_token_len_per_gpu=$(( (max_prompt_length + max_response_length) * 4 ))
-log_prob_max_token_len_per_gpu=$(( actor_max_token_len_per_gpu * 2 ))
-
-train_files="['$train_files']"
-test_files="['$test_files']"
-
-python3 -m verl.trainer.main_ppo \
- algorithm.adv_estimator=$adv_estimator \
- algorithm.use_kl_in_reward=$use_kl_in_reward \
- algorithm.kl_ctrl.kl_coef=$kl_coef \
- data.train_files="$train_files" \
- data.val_files="$test_files" \
- data.return_raw_chat=true \
- data.train_batch_size=$train_batch_size \
- data.max_prompt_length=$max_prompt_length \
- data.max_response_length=$max_response_length \
- data.filter_overlong_prompts=true \
- data.truncation='error' \
- actor_rollout_ref.model.path="$model_path" \
- actor_rollout_ref.model.use_remove_padding=true \
- actor_rollout_ref.model.enable_gradient_checkpointing=true \
- actor_rollout_ref.actor.use_kl_loss=$use_kl_loss \
- actor_rollout_ref.actor.kl_loss_coef=$kl_loss_coef \
- actor_rollout_ref.actor.clip_ratio_low=$clip_ratio_low \
- actor_rollout_ref.actor.clip_ratio_high=$clip_ratio_high \
- actor_rollout_ref.actor.clip_ratio_c=10.0 \
- actor_rollout_ref.actor.optim.lr=$actor_lr \
- actor_rollout_ref.actor.use_dynamic_bsz=true \
- actor_rollout_ref.actor.ppo_mini_batch_size=$ppo_mini_batch_size \
- actor_rollout_ref.actor.ppo_max_token_len_per_gpu=$actor_max_token_len_per_gpu \
- actor_rollout_ref.actor.ulysses_sequence_parallel_size=$train_sp \
- actor_rollout_ref.actor.fsdp_config.param_offload=$offload \
- actor_rollout_ref.actor.fsdp_config.optimizer_offload=$offload \
- actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=$log_prob_max_token_len_per_gpu \
- actor_rollout_ref.rollout.name=vllm \
- actor_rollout_ref.rollout.mode=async \
- actor_rollout_ref.rollout.tensor_model_parallel_size=$infer_tp \
- actor_rollout_ref.rollout.multi_turn.max_user_turns=$max_turns \
- actor_rollout_ref.rollout.multi_turn.max_assistant_turns=$max_turns \
- actor_rollout_ref.rollout.multi_turn.format=hermes \
- actor_rollout_ref.rollout.agent.agent_loop_config_path=$agent_loop_config_path \
- actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
- actor_rollout_ref.rollout.n=$n_resp_per_prompt \
- actor_rollout_ref.rollout.val_kwargs.top_p=0.6 \
- actor_rollout_ref.rollout.val_kwargs.temperature=1.0 \
- actor_rollout_ref.rollout.val_kwargs.n=$n_resp_per_prompt_val \
- trainer.logger='["console","wandb"]' \
- trainer.project_name=$project_name \
- trainer.experiment_name=$experiment_name \
- trainer.n_gpus_per_node="$GPUS_PER_NODE" \
- trainer.val_before_train=true \
- trainer.log_val_generations=50 \
- trainer.nnodes="$NNODES" \
- trainer.save_freq=-1 \
- trainer.default_local_dir="$default_local_dir" \
- trainer.test_freq=5 \
- trainer.total_epochs=1 "$@"
\ No newline at end of file
diff --git a/recipe/langgraph_agent/react_agent_loop.py b/recipe/langgraph_agent/react_agent_loop.py
deleted file mode 100644
index 2f8a3e20f0e..00000000000
--- a/recipe/langgraph_agent/react_agent_loop.py
+++ /dev/null
@@ -1,188 +0,0 @@
-# Copyright 2024 Bytedance Ltd. and/or its affiliates
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-LangGraph React Agent Loop.
-
-This implementation is exact same as `ToolAgentLoop`.
-
-Ref: https://langchain-ai.github.io/langgraph/tutorials/workflows/
-"""
-
-import logging
-from typing import Any, Literal
-
-from langchain_core.messages import AIMessage
-from langchain_core.runnables import RunnableConfig
-from langgraph.graph import END, MessagesState, StateGraph
-from langgraph.prebuilt import ToolNode
-
-from recipe.langgraph_agent.chat_model import (
- ChatModel,
- MaxTokenExceededError,
- convert_to_agent_output,
-)
-from verl.experimental.agent_loop.agent_loop import AgentLoopBase, AgentLoopOutput
-
-logger = logging.getLogger(__name__)
-
-
-async def call_model(state: MessagesState, config: RunnableConfig):
- model = config["configurable"]["model"]
- sampling_params = config["configurable"]["sampling_params"]
- try:
- message = await model.ainvoke(state["messages"], sampling_params=sampling_params)
- return {"messages": [message]}
- except MaxTokenExceededError:
- # last message is ToolMessage
- return {"messages": []}
-
-
-def should_continue(state: MessagesState, config: RunnableConfig) -> Literal["tools", END]:
- # Safely extract max_assistant_turns from config
- max_assistant_turns = None
- try:
- if config and "configurable" in config:
- max_assistant_turns = config["configurable"].get("max_assistant_turns")
- except Exception as e:
- logger.warning(f"Failed to extract max_assistant_turns from config: {e}")
-
- num_assistant_turns = 0
- for message in state["messages"]:
- if message.type == "ai":
- num_assistant_turns += 1
-
- last_message = state["messages"][-1]
-
- # LLM call failed, e.g: max response length exceeded
- if last_message.type == "tool":
- return END
-
- # max assistant turns exceeded
- # Use a reasonable default limit (25) if max_assistant_turns is not set
- # This prevents infinite loops
- effective_max_turns = max_assistant_turns if max_assistant_turns is not None else 25
- if num_assistant_turns >= effective_max_turns:
- return END
-
- # no tool calls
- if not getattr(last_message, "tool_calls", None):
- return END
-
- return "tools"
-
-
-class ReactAgentLoop(AgentLoopBase):
- # Recursion limit calculation constants
- DEFAULT_MAX_ASSISTANT_TURNS = 25
- MIN_RECURSION_LIMIT = 50
- NODES_PER_TURN = 2 # Each AI turn involves agent + tools nodes
- RECURSION_LIMIT_SAFETY_FACTOR = 1.5 # 50% buffer for edge cases
-
- @classmethod
- def init_class(cls, config, tokenizer, **kwargs):
- if cls._class_initialized:
- return
- cls._class_initialized = True
- print("Performing class-level ReactAgentLoop initialization")
-
- # build graph
- cls.graph = cls.build_graph()
-
- @classmethod
- def build_graph(cls) -> StateGraph:
- workflow = StateGraph(MessagesState)
-
- workflow.add_node("agent", call_model)
- workflow.add_node("tools", ToolNode(cls.tools))
- workflow.set_entry_point("agent")
- workflow.add_conditional_edges(
- "agent",
- should_continue,
- {
- "tools": "tools",
- END: END,
- },
- )
-
- workflow.add_edge("tools", "agent")
- graph = workflow.compile()
- return graph
-
- async def run(self, sampling_params: dict[str, Any], **kwargs) -> AgentLoopOutput:
- messages = list(kwargs["raw_prompt"])
-
- model_path = self.config.actor_rollout_ref.model.path
- model_name = "/".join(model_path.split("/")[-2:])
-
- rollout = self.config.actor_rollout_ref.rollout
- model = ChatModel(
- model=model_name,
- client=self.server_manager,
- tokenizer=self.tokenizer,
- max_tokens=rollout.response_length,
- max_parallel_calls=rollout.multi_turn.max_parallel_calls,
- tool_parser=rollout.multi_turn.format,
- )
-
- model = model.bind_tools(self.tools, tool_choice="any")
-
- # Calculate recursion_limit dynamically based on max_assistant_turns
- max_assistant_turns = (
- rollout.multi_turn.max_assistant_turns
- if rollout.multi_turn.max_assistant_turns
- else self.DEFAULT_MAX_ASSISTANT_TURNS
- )
-
- # Formula: nodes_per_turn * max_turns * safety_buffer, with minimum threshold
- recursion_limit = max(
- self.MIN_RECURSION_LIMIT,
- int(max_assistant_turns * self.NODES_PER_TURN * self.RECURSION_LIMIT_SAFETY_FACTOR),
- )
- logger.info(f"Configured recursion_limit={recursion_limit} (max_assistant_turns={max_assistant_turns})")
-
- config = {
- "configurable": {
- "model": model,
- "sampling_params": sampling_params,
- "max_user_turns": rollout.multi_turn.max_user_turns,
- "max_assistant_turns": rollout.multi_turn.max_assistant_turns,
- },
- "recursion_limit": recursion_limit,
- }
-
- # TODO: how to handle multiple trajectories in an graph invocation?
- # Each graph node may has its own LLM calls and state, e.g:
- # https://github.com/google-gemini/gemini-fullstack-langgraph-quickstart
- try:
- state = await self.graph.ainvoke(input={"messages": messages}, config=config)
- except Exception as e:
- logger.error(f"Agent loop execution failed: {type(e).__name__}: {e}")
- logger.error("Falling back to a minimal dummy trajectory.")
-
- # Fallback to a minimal assistant message so that
- # convert_to_agent_output and downstream padding logic
- # can still run without crashing.
- dummy_id = 0
- fallback_message = AIMessage(
- content="[Agent execution failed - no valid trajectory]",
- response_metadata={
- "request_id": "fallback",
- "prompt_ids": [dummy_id, dummy_id],
- "response_mask": [1],
- },
- )
- state = {"messages": [fallback_message]}
-
- output = convert_to_agent_output(state["messages"], rollout.response_length)
- return output
diff --git a/recipe/langgraph_agent/test_react_agent_loop.py b/recipe/langgraph_agent/test_react_agent_loop.py
deleted file mode 100644
index 88509cc4819..00000000000
--- a/recipe/langgraph_agent/test_react_agent_loop.py
+++ /dev/null
@@ -1,202 +0,0 @@
-# Copyright 2024 Bytedance Ltd. and/or its affiliates
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import json
-import os
-
-import numpy as np
-import pytest
-import ray
-from langchain_core.tools import tool
-from omegaconf import DictConfig
-
-from recipe.langgraph_agent.react_agent_loop import ReactAgentLoop
-from tests.experimental.agent_loop.agent_utils import init_agent_loop_manager
-from verl.protocol import DataProto
-from verl.utils import hf_tokenizer
-
-
-@pytest.fixture
-def init_config() -> DictConfig:
- from hydra import compose, initialize_config_dir
-
- with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config")):
- config = compose(config_name="ppo_trainer")
- model_path = "Qwen/Qwen2.5-1.5B-Instruct"
- config.actor_rollout_ref.model.path = model_path
- config.actor_rollout_ref.rollout.name = os.getenv("ROLLOUT_NAME", "vllm")
- config.actor_rollout_ref.rollout.mode = "async"
- config.actor_rollout_ref.rollout.prompt_length = 4096
- config.actor_rollout_ref.rollout.response_length = 4096
- config.actor_rollout_ref.rollout.n = 4
- config.actor_rollout_ref.rollout.agent.num_workers = 2
-
- config.actor_rollout_ref.actor.use_dynamic_bsz = True
- # test sleep/wake_up with fsdp offload
- config.actor_rollout_ref.actor.fsdp_config.param_offload = True
- config.actor_rollout_ref.actor.fsdp_config.optimizer_offload = True
-
- return config
-
-
-@tool(parse_docstring=True)
-def get_current_temperature(location: str, unit: str = "celsius"):
- """Get current temperature at a location.
-
- Args:
- location: The location to get the temperature for, in the format "City, State, Country".
- unit: The unit to return the temperature in. Defaults to "celsius". (choices: ["celsius", "fahrenheit"])
-
- Returns:
- the temperature, the location, and the unit in a dict
- """
- print(f"[DEBUG] get_current_temperature: {location}, {unit}")
- return {
- "temperature": 26.1,
- "location": location,
- "unit": unit,
- }
-
-
-@tool(parse_docstring=True)
-def get_temperature_date(location: str, date: str, unit: str = "celsius"):
- """Get temperature at a location and date.
-
- Args:
- location: The location to get the temperature for, in the format "City, State, Country".
- date: The date to get the temperature for, in the format "Year-Month-Day".
- unit: The unit to return the temperature in. Defaults to "celsius". (choices: ["celsius", "fahrenheit"])
-
- Returns:
- the temperature, the location, the date and the unit in a dict
- """
- print(f"[DEBUG] get_temperature_date: {location}, {date}, {unit}")
- return {
- "temperature": 25.9,
- "location": location,
- "date": date,
- "unit": unit,
- }
-
-
-class TestReactAgentLoop(ReactAgentLoop):
- @classmethod
- def init_class(cls, config, tokenizer, **kwargs):
- # TODO: find better way to configure tools
- cls.tools = [get_current_temperature, get_temperature_date]
- super().init_class(config, tokenizer, **kwargs)
-
-
-def test_react_agent(init_config):
- ray.init(
- runtime_env={
- "env_vars": {
- "TOKENIZERS_PARALLELISM": "true",
- "NCCL_DEBUG": "WARN",
- "VLLM_LOGGING_LEVEL": "INFO",
- "VLLM_USE_V1": "1",
- }
- }
- )
-
- # =========================== 1. Init rollout manager ===========================
- agent_loop_config = [
- {
- "_target_": "recipe.langgraph_agent.test_react_agent_loop.TestReactAgentLoop",
- "name": "react_agent",
- },
- ]
- agent_loop_config_path = "/tmp/agent_loop_config.json"
- with open(agent_loop_config_path, "w") as f:
- json.dump(agent_loop_config, f)
-
- n = 2
- init_config.actor_rollout_ref.rollout.n = n
- # init_config.actor_rollout_ref.rollout.multi_turn.tool_config_path = tool_config_path
- init_config.actor_rollout_ref.rollout.multi_turn.max_parallel_calls = 2
- init_config.actor_rollout_ref.rollout.agent.agent_loop_config_path = agent_loop_config_path
- agent_loop_manager = init_agent_loop_manager(init_config)
-
- # =========================== 2. Generate sequences ===========================
- raw_prompts = [
- [
- {"role": "user", "content": "How are you?"},
- ],
- [
- {"role": "user", "content": "What's the temperature in Los Angeles now?"},
- ],
- [
- {"role": "user", "content": "What's the temperature in New York now?"},
- ],
- [
- {
- "role": "system",
- "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant.\n\n"
- "Current Date: 2024-09-30",
- },
- {"role": "user", "content": "What's the temperature in San Francisco now? How about tomorrow?"},
- ],
- ]
- batch = DataProto(
- non_tensor_batch={
- "raw_prompt": np.array([np.array(prompt) for prompt in raw_prompts], dtype=object),
- "agent_name": np.array(["react_agent"] * len(raw_prompts)),
- "data_source": np.array(["openai/gsm8k"] * len(raw_prompts)),
- "reward_model": np.array([{"style": "rule", "ground_truth": "1.0"}] * len(raw_prompts)),
- },
- )
- batch = batch.repeat(n)
- result = agent_loop_manager.generate_sequences(prompts=batch)
- assert len(result) == len(raw_prompts) * n
-
- # Check turns
- num_turns = result.non_tensor_batch["__num_turns__"]
- print(f"num_turns: {num_turns}")
- for i in range(len(num_turns)):
- if i // n == 0:
- # [user, assistant]
- assert num_turns[i] == 2
- else:
- # [user, assistant, tool, assistant]
- assert num_turns[i] == 4
-
- # Check response_mask
- tokenizer = hf_tokenizer(init_config.actor_rollout_ref.model.path)
- responses = result.batch["responses"]
- response_mask = result.batch["response_mask"]
- attention_mask = result.batch["attention_mask"]
- assert responses.size() == response_mask.size(), f"{responses.size()} != {response_mask.size()}"
- response_length = response_mask.size(1)
-
- for i in range(len(responses)):
- # response with tool response
- valid_tokens = responses[i][attention_mask[i][-response_length:].bool()]
- response_with_obs = tokenizer.decode(valid_tokens)
-
- # response without tool response
- valid_tokens = responses[i][response_mask[i].bool()]
- response_without_obs = tokenizer.decode(valid_tokens)
-
- assert "" not in response_without_obs, (
- f"found in response: {response_without_obs}"
- )
- assert "" not in response_without_obs, (
- f"found in response: {response_without_obs}"
- )
- print("=========================")
- print(response_with_obs)
- print("---")
- print(response_without_obs)
-
- print("Test passed!")
- ray.shutdown()
diff --git a/recipe/retool/README.md b/recipe/retool/README.md
deleted file mode 100644
index 0f2dc0651e8..00000000000
--- a/recipe/retool/README.md
+++ /dev/null
@@ -1,56 +0,0 @@
-# Retool
-[ReTool: Reinforcement Learning for Strategic Tool Use in LLMs](https://arxiv.org/abs/2504.11536)
-
-## Overview
-- Base model: [Qwen/Qwen2.5-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct)
-- SFT dataset: [JoeYing/ReTool-SFT](https://huggingface.co/datasets/JoeYing/ReTool-SFT)
-- RL dataset: [BytedTsinghua-SIA/DAPO-Math-17k](https://huggingface.co/datasets/BytedTsinghua-SIA/DAPO-Math-17k)
-- Val dataset: [yentinglin/aime_2025](https://huggingface.co/datasets/yentinglin/aime_2025)
-
-## How it works
-
- Retool's workflow is divided into two key phases:
-
-1. Cold Start and Supervised Fine Tuning (SFT)
-
- The data generation pipeline builds a high-quality dataset containing code-enhanced inference trajectories, and supervised fine-tuning enables the model to master basic Tool call (e.g., code execution) and analysis of the execution results.
-
-2. Dynamic Interaction and Policy Optimization (RL).
-
- With the verl Reinforcement Learning framework, the model dynamically inserts code blocks during inference and interacts with the sandbox environment in real-time, generating a hybrid trajectory of natural language thinking and code snippets, sending the code to the sandbox for asynchronous execution when code termination markers are detected, and the execution results (success outputs/errors) are fed back to the model for guiding the subsequent inference. This "think-execute-feedback" cycle, together with the design of rewards based on the accuracy of the final answer, enables the model to independently optimize the Tool call strategy, and improves the reasoning efficiency and computational accuracy.
-
-## SFT
-1. Data preparation
-```bash
-python3 recipe/retool/retool_sft_preprocess.py
-```
-
-2. Training
-```bash
-bash recipe/retool/run_qwen2-32b_sft.sh
-```
-
-After 6 epoches, validation metrics:
-- val-core/aime_2025/acc/mean@30: 0.24
-- val-aux/num_turns/mean: 7.2
-
-## RL
-
-### GRPO
-```bash
-bash recipe/retool/run_qwen2-32b_dapo.sh
-```
-
-After 150 steps, validation metrics:
-- val-core/aime_2025/acc/mean@30: 0.6
-- val-aux/num_turns/mean: 10
-
-### PPO
-
-```bash
-bash recipe/retool/run_qwen2-32b_ppo.sh
-```
-
-After 250 steps, validation metrics:
-- val-core/aime_2025/acc/mean@30: 0.55
-- val-aux/num_turns/mean: 8.3
diff --git a/recipe/retool/retool.py b/recipe/retool/retool.py
deleted file mode 100644
index 223b36399d0..00000000000
--- a/recipe/retool/retool.py
+++ /dev/null
@@ -1,120 +0,0 @@
-# Copyright 2024 Bytedance Ltd. and/or its affiliates
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import logging
-import re
-from typing import Any
-
-import datasets
-
-from verl.tools.base_tool import OpenAIFunctionToolSchema
-from verl.tools.sandbox_fusion_tools import SandboxFusionTool
-from verl.utils.dataset import RLHFDataset
-from verl.utils.reward_score import math_dapo
-from verl.utils.rollout_trace import rollout_trace_op
-
-logger = logging.getLogger(__name__)
-
-
-class CustomSandboxFusionTool(SandboxFusionTool):
- def __init__(self, config: dict, tool_schema: OpenAIFunctionToolSchema):
- super().__init__(config, tool_schema)
- self.code_pattern = re.compile(r"```python(.*?)```", re.DOTALL)
-
- @rollout_trace_op
- async def execute(self, instance_id: str, parameters: dict[str, Any], **kwargs) -> tuple[str, float, dict]:
- code = parameters["code"]
- matches = self.code_pattern.findall(code)
- if matches:
- code = matches[0].strip()
-
- # NOTE: some script may not explicitly print result, we need to add a print statement to the end of the script
- lines = code.split("\n")
- for i, line in reversed(list(enumerate(lines))):
- if line == "":
- continue
- if not lines[i].startswith("print"):
- lines[i] = f"print({line})"
- break
- code = "\n".join(lines)
-
- timeout = parameters.get("timeout", self.default_timeout)
- language = parameters.get("language", self.default_language)
- if not isinstance(code, str):
- code = str(code)
-
- result = await self.execution_pool.execute.remote(self.execute_code, instance_id, code, timeout, language)
- # sandbox has no score or metrics, use Nones
- return result, None, None
-
-
-answer_format = """\nThe answer format must be: \\boxed{'The final answer goes here.'}"""
-
-
-class CustomRLHFDataset(RLHFDataset):
- """Custom dataset class to process Maxwell-Jia/AIME_2024, yentinglin/aime_2025 datasets."""
-
- def _read_files_and_tokenize(self):
- dataframes = []
- for parquet_file in self.data_files:
- # read parquet files and cache
- dataframe = datasets.load_dataset(parquet_file)["train"]
- data_source = "/".join(parquet_file.split("/")[-2:])
- if data_source in ["Maxwell-Jia/AIME_2024", "yentinglin/aime_2025"]:
- dataframe = dataframe.map(
- self.map_fn, fn_kwargs={"data_source": data_source}, remove_columns=dataframe.column_names
- )
- else:
- dataframe = dataframe.map(self.map_fn2, num_proc=16)
- dataframes.append(dataframe)
- self.dataframe: datasets.Dataset = datasets.concatenate_datasets(dataframes)
-
- print(f"dataset len: {len(self.dataframe)}")
-
- def map_fn(self, row: dict, *, data_source: str = None):
- if data_source == "Maxwell-Jia/AIME_2024":
- problem, answer = row["Problem"], row["Answer"]
- elif data_source == "yentinglin/aime_2025":
- problem, answer = row["problem"], row["answer"]
-
- prompt = problem + answer_format
- data = {
- "data_source": data_source.split("/")[1].lower(), # aime_2024, aime_2025
- "prompt": [{"role": "user", "content": prompt}],
- "ability": "MATH",
- "reward_model": {"ground_truth": str(answer)},
- "agent_name": "tool_agent",
- }
- return data
-
- def map_fn2(self, row: dict):
- content = row["prompt"][0]["content"]
- row["prompt"][0]["content"] = content + answer_format
- row["agent_name"] = "tool_agent"
- return row
-
-
-def compute_score(data_source, solution_str, ground_truth, extra_info, **kwargs):
- # use \\boxed{...} answer
- result = math_dapo.compute_score(solution_str, ground_truth, strict_box_verify=True)
-
- # encourage model to call tools
- num_turns = extra_info["num_turns"]
- if result["score"] < 0:
- tool_call_reward = (num_turns - 2) / 2 * 0.1
- result["score"] = min(-0.6, result["score"] + tool_call_reward)
-
- if result["pred"] is None:
- result["pred"] = ""
-
- return result
diff --git a/recipe/retool/retool_sft_preprocess.py b/recipe/retool/retool_sft_preprocess.py
deleted file mode 100644
index 57d53c91c62..00000000000
--- a/recipe/retool/retool_sft_preprocess.py
+++ /dev/null
@@ -1,136 +0,0 @@
-# Copyright 2024 Bytedance Ltd. and/or its affiliates
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Convert JoeYing/ReTool-SFT to standard multi-turn tool calling messages.
-"""
-
-import json
-import os
-import re
-from typing import Any
-
-import datasets
-from omegaconf import OmegaConf
-
-code_pattern = re.compile(r"```python(.*?)```", re.DOTALL)
-
-
-def extract_code_message(content: str) -> tuple[dict[str, Any], str]:
- start, stop = "", ""
- i = content.find(start)
- if i == -1:
- return None, content
- j = content.find(stop)
- assert j > i
-
- code = content[i + len(start) : j]
- matches = code_pattern.findall(code)
- if matches:
- code = matches[0].strip()
-
- message = {
- "role": "assistant",
- "content": content[:i].strip(),
- "tool_calls": [
- {
- "type": "function",
- "function": {
- "name": "code_interpreter",
- "arguments": {"code": code},
- },
- },
- ],
- }
- return message, content[j + len(stop) :]
-
-
-def extract_answer_message(content: str) -> tuple[dict[str, Any], str]:
- start, stop = "", ""
- i = content.find(start)
- if i == -1:
- return None, content
- j = content.find(stop)
- assert j > i
-
- answer = content[:i] + content[i + len(start) : j]
- message = {
- "role": "assistant",
- "content": answer.strip(),
- }
- return message, content[j + len(stop) :]
-
-
-def extract_interpreter_message(content: str) -> tuple[dict[str, Any], str]:
- start, stop = "", ""
- i = content.find(start)
- if i == -1:
- return None, content
- j = content.find(stop)
- assert j > i
-
- interpreter = content[i + len(start) : j]
- message = {
- "role": "tool",
- "content": interpreter.strip(),
- }
- return message, content[j + len(stop) :]
-
-
-def process(row: dict, *, tools: str):
- messages = []
-
- # extract problem
- content = row["messages"][0]["content"]
- start = "*user question:*"
- i = content.find(start)
- assert i != -1
- prompt = content[i + len(start) :].replace("", "").replace("", "").strip()
- messages.append(
- {
- "role": "user",
- "content": prompt,
- }
- )
-
- # extract multi turns
- content = row["messages"][1]["content"]
- role = "assistant"
- while len(content) > 0:
- if role == "assistant":
- message, content = extract_code_message(content)
- if message is None:
- message, content = extract_answer_message(content)
- assert message is not None
- messages.append(message)
- role = "tool"
- else:
- message, content = extract_interpreter_message(content)
- assert message is not None
- messages.append(message)
- role = "assistant"
-
- tools = json.loads(tools)
- return {"messages": messages, "tools": tools}
-
-
-if __name__ == "__main__":
- tools_config_file = "recipe/retool/sandbox_fusion_tool_config.yaml"
- tools_config = OmegaConf.load(tools_config_file)
- tool_schema = OmegaConf.to_container(tools_config["tools"][0]["tool_schema"])
- tools = json.dumps([tool_schema])
-
- data = datasets.load_dataset("JoeYing/ReTool-SFT")["train"]
- data = data.map(process, fn_kwargs={"tools": tools})
- save_path = os.path.expanduser("~/ReTool-SFT/data/train-00000-of-00001.parquet")
- data.to_parquet(save_path)
diff --git a/recipe/retool/run_gpt_oss_ppo.sh b/recipe/retool/run_gpt_oss_ppo.sh
deleted file mode 100644
index 6519a45ca73..00000000000
--- a/recipe/retool/run_gpt_oss_ppo.sh
+++ /dev/null
@@ -1,125 +0,0 @@
-set -x
-
-# ================= data/model/tool =================
-HDFS_ROOT=${HDFS_ROOT:-$PWD}
-DATA_ROOT=${DATA_ROOT:-$PWD}
-
-dapo_math_17k=$DATA_ROOT/dataset/BytedTsinghua-SIA/DAPO-Math-17k
-aime_2024=$DATA_ROOT/dataset/Maxwell-Jia/AIME_2024
-aime_2025=$DATA_ROOT/dataset/yentinglin/aime_2025
-actor_model_path=lmsys/gpt-oss-20b-bf16
-critic_model_path=$actor_model_path
-
-train_files="['$dapo_math_17k']"
-test_files="['$aime_2025']"
-
-# tool
-tool_config_path=recipe/retool/sandbox_fusion_tool_config.yaml
-
-# wandb
-project_name=wuxibin_retool
-experiment_name=gpt-oss-20b-bf16_ppo
-default_local_dir=$DATA_ROOT/checkpoint/$experiment_name
-
-# ================= algorithm =================
-adv_estimator=gae
-
-use_kl_in_reward=False
-kl_coef=0.0
-use_kl_loss=False
-kl_loss_coef=0.0
-
-clip_ratio_low=0.2
-clip_ratio_high=0.28
-
-max_turns=8
-max_prompt_length=2048
-max_response_length=16384
-actor_lr=1e-6
-critic_lr=2e-6
-gae_gamma=1.0
-gae_lam=1.0
-
-critic_warmup=20
-
-train_batch_size=512
-ppo_mini_batch_size=512
-n_resp_per_prompt_val=30
-
-# ================= perfomance =================
-infer_tp=4 # vllm
-train_sp=4 # train
-
-offload=True
-
-actor_max_token_len_per_gpu=$(( (max_prompt_length + max_response_length) * 2 ))
-critic_max_token_len_per_gpu=$(( (max_prompt_length + max_response_length) * 4 ))
-
-
-python3 -m verl.trainer.main_ppo \
- algorithm.adv_estimator=$adv_estimator \
- algorithm.use_kl_in_reward=$use_kl_in_reward \
- algorithm.kl_ctrl.kl_coef=$kl_coef \
- algorithm.gamma=$gae_gamma \
- algorithm.lam=$gae_lam \
- data.train_files="$train_files" \
- data.val_files="$test_files" \
- data.return_raw_chat=True \
- data.train_batch_size=$train_batch_size \
- data.max_prompt_length=$max_prompt_length \
- data.max_response_length=$max_response_length \
- data.filter_overlong_prompts=True \
- +data.apply_chat_template_kwargs.reasoning_effort=medium \
- data.truncation='error' \
- data.custom_cls.path=recipe/retool/retool.py \
- data.custom_cls.name=CustomRLHFDataset \
- custom_reward_function.path=recipe/retool/retool.py \
- custom_reward_function.name=compute_score \
- actor_rollout_ref.model.path=$actor_model_path \
- actor_rollout_ref.model.use_remove_padding=True \
- actor_rollout_ref.model.enable_gradient_checkpointing=True \
- actor_rollout_ref.actor.use_kl_loss=$use_kl_loss \
- actor_rollout_ref.actor.kl_loss_coef=$kl_loss_coef \
- actor_rollout_ref.actor.clip_ratio_low=$clip_ratio_low \
- actor_rollout_ref.actor.clip_ratio_high=$clip_ratio_high \
- actor_rollout_ref.actor.clip_ratio_c=10.0 \
- actor_rollout_ref.actor.optim.lr=$actor_lr \
- actor_rollout_ref.actor.use_dynamic_bsz=True \
- actor_rollout_ref.actor.ppo_mini_batch_size=$ppo_mini_batch_size \
- actor_rollout_ref.actor.ppo_max_token_len_per_gpu=$actor_max_token_len_per_gpu \
- actor_rollout_ref.actor.ulysses_sequence_parallel_size=$train_sp \
- actor_rollout_ref.actor.fsdp_config.param_offload=$offload \
- actor_rollout_ref.actor.fsdp_config.optimizer_offload=$offload \
- actor_rollout_ref.rollout.name=sglang \
- actor_rollout_ref.rollout.mode=async \
- actor_rollout_ref.rollout.tensor_model_parallel_size=$infer_tp \
- actor_rollout_ref.rollout.multi_turn.enable=True \
- actor_rollout_ref.rollout.multi_turn.max_user_turns=$max_turns \
- actor_rollout_ref.rollout.multi_turn.max_assistant_turns=$max_turns \
- actor_rollout_ref.rollout.multi_turn.tool_config_path=$tool_config_path \
- actor_rollout_ref.rollout.multi_turn.format=gpt-oss \
- +actor_rollout_ref.rollout.engine_kwargs.sglang.attention_backend=triton \
- actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
- actor_rollout_ref.rollout.val_kwargs.top_p=1.0 \
- actor_rollout_ref.rollout.val_kwargs.temperature=1.0 \
- actor_rollout_ref.rollout.val_kwargs.n=$n_resp_per_prompt_val \
- critic.optim.lr=$critic_lr \
- critic.model.use_remove_padding=True \
- critic.model.path=$critic_model_path \
- critic.model.enable_gradient_checkpointing=True \
- critic.ppo_max_token_len_per_gpu=$critic_max_token_len_per_gpu \
- critic.ulysses_sequence_parallel_size=$train_sp \
- critic.model.fsdp_config.param_offload=$offload \
- critic.model.fsdp_config.optimizer_offload=$offload \
- trainer.critic_warmup=$critic_warmup \
- trainer.logger=['console','wandb'] \
- trainer.project_name=$project_name \
- trainer.experiment_name=$experiment_name \
- trainer.n_gpus_per_node=8 \
- trainer.val_before_train=True \
- trainer.log_val_generations=100 \
- trainer.nnodes=2 \
- trainer.save_freq=30 \
- trainer.default_local_dir=$default_local_dir \
- trainer.test_freq=5 \
- trainer.total_epochs=1 $@
diff --git a/recipe/retool/run_qwen2-32b_dapo.sh b/recipe/retool/run_qwen2-32b_dapo.sh
deleted file mode 100644
index 2df380da24c..00000000000
--- a/recipe/retool/run_qwen2-32b_dapo.sh
+++ /dev/null
@@ -1,107 +0,0 @@
-set -x
-
-# ================= data/model/tool =================
-HDFS_ROOT=${HDFS_ROOT:-$PWD}
-DATA_ROOT=${DATA_ROOT:-$PWD}
-
-dapo_math_17k=$DATA_ROOT/dataset/BytedTsinghua-SIA/DAPO-Math-17k
-aime_2024=$DATA_ROOT/dataset/Maxwell-Jia/AIME_2024
-aime_2025=$DATA_ROOT/dataset/yentinglin/aime_2025
-model_path=$HDFS_ROOT/checkpoint/multiturn-sft-qwen-2.5-32b-instruct/global_step_372
-
-train_files="['$dapo_math_17k']"
-test_files="['$aime_2025']"
-
-# tool
-tool_config_path=recipe/retool/sandbox_fusion_tool_config.yaml
-
-# wandb
-project_name=wuxibin_retool
-experiment_name=qwen2.5-32b_dapo
-default_local_dir=$DATA_ROOT/checkpoint/$experiment_name
-
-# ================= algorithm =================
-adv_estimator=grpo
-
-use_kl_in_reward=False
-kl_coef=0.0
-use_kl_loss=False
-kl_loss_coef=0.0
-
-clip_ratio_low=0.2
-clip_ratio_high=0.28
-
-max_turns=8
-max_prompt_length=2048
-max_response_length=16384
-actor_lr=1e-6
-
-train_batch_size=512
-ppo_mini_batch_size=64
-n_resp_per_prompt=16
-n_resp_per_prompt_val=30
-
-# ================= perfomance =================
-infer_tp=4 # vllm
-train_sp=8 # train
-offload=True
-
-actor_max_token_len_per_gpu=$(( (max_prompt_length + max_response_length) * 1 ))
-log_prob_max_token_len_per_gpu=$(( actor_max_token_len_per_gpu * 4 ))
-
-python3 -m verl.trainer.main_ppo \
- algorithm.adv_estimator=$adv_estimator \
- algorithm.use_kl_in_reward=$use_kl_in_reward \
- algorithm.kl_ctrl.kl_coef=$kl_coef \
- data.train_files="$train_files" \
- data.val_files="$test_files" \
- data.return_raw_chat=True \
- data.train_batch_size=$train_batch_size \
- data.max_prompt_length=$max_prompt_length \
- data.max_response_length=$max_response_length \
- data.filter_overlong_prompts=True \
- data.truncation='error' \
- data.custom_cls.path=recipe/retool/retool.py \
- data.custom_cls.name=CustomRLHFDataset \
- custom_reward_function.path=recipe/retool/retool.py \
- custom_reward_function.name=compute_score \
- actor_rollout_ref.model.path=$model_path \
- actor_rollout_ref.model.use_remove_padding=True \
- actor_rollout_ref.model.enable_gradient_checkpointing=True \
- actor_rollout_ref.actor.use_kl_loss=$use_kl_loss \
- actor_rollout_ref.actor.kl_loss_coef=$kl_loss_coef \
- actor_rollout_ref.actor.clip_ratio_low=$clip_ratio_low \
- actor_rollout_ref.actor.clip_ratio_high=$clip_ratio_high \
- actor_rollout_ref.actor.clip_ratio_c=10.0 \
- actor_rollout_ref.actor.optim.lr=$actor_lr \
- actor_rollout_ref.actor.use_dynamic_bsz=True \
- actor_rollout_ref.actor.ppo_mini_batch_size=$ppo_mini_batch_size \
- actor_rollout_ref.actor.ppo_max_token_len_per_gpu=$actor_max_token_len_per_gpu \
- actor_rollout_ref.actor.ulysses_sequence_parallel_size=$train_sp \
- actor_rollout_ref.actor.fsdp_config.param_offload=$offload \
- actor_rollout_ref.actor.fsdp_config.optimizer_offload=$offload \
- actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=$log_prob_max_token_len_per_gpu \
- actor_rollout_ref.rollout.name=vllm \
- actor_rollout_ref.rollout.mode=async \
- actor_rollout_ref.rollout.tensor_model_parallel_size=$infer_tp \
- actor_rollout_ref.rollout.multi_turn.enable=True \
- actor_rollout_ref.rollout.multi_turn.max_user_turns=$max_turns \
- actor_rollout_ref.rollout.multi_turn.max_assistant_turns=$max_turns \
- actor_rollout_ref.rollout.multi_turn.tool_config_path=$tool_config_path \
- actor_rollout_ref.rollout.multi_turn.format=hermes \
- actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
- actor_rollout_ref.rollout.n=$n_resp_per_prompt \
- actor_rollout_ref.rollout.val_kwargs.top_p=0.6 \
- actor_rollout_ref.rollout.val_kwargs.temperature=1.0 \
- actor_rollout_ref.rollout.val_kwargs.n=$n_resp_per_prompt_val \
- trainer.logger=['console','wandb'] \
- trainer.project_name=$project_name \
- trainer.experiment_name=$experiment_name \
- trainer.n_gpus_per_node=8 \
- trainer.val_before_train=True \
- trainer.log_val_generations=100 \
- trainer.nnodes=2 \
- trainer.save_freq=30 \
- trainer.default_local_dir=$default_local_dir \
- trainer.test_freq=5 \
- trainer.total_epochs=1 $@
diff --git a/recipe/retool/run_qwen2-32b_ppo.sh b/recipe/retool/run_qwen2-32b_ppo.sh
deleted file mode 100644
index 1e3ef2cd7fb..00000000000
--- a/recipe/retool/run_qwen2-32b_ppo.sh
+++ /dev/null
@@ -1,123 +0,0 @@
-set -x
-
-# ================= data/model/tool =================
-HDFS_ROOT=${HDFS_ROOT:-$PWD}
-DATA_ROOT=${DATA_ROOT:-$PWD}
-
-dapo_math_17k=$DATA_ROOT/dataset/BytedTsinghua-SIA/DAPO-Math-17k
-aime_2024=$DATA_ROOT/dataset/Maxwell-Jia/AIME_2024
-aime_2025=$DATA_ROOT/dataset/yentinglin/aime_2025
-actor_model_path=$HDFS_ROOT/checkpoint/multiturn-sft-qwen-2.5-32b-instruct/global_step_372
-critic_model_path=$actor_model_path
-
-train_files="['$dapo_math_17k']"
-test_files="['$aime_2025']"
-
-# tool
-tool_config_path=recipe/retool/sandbox_fusion_tool_config.yaml
-
-# wandb
-project_name=wuxibin_retool
-experiment_name=qwen2.5-32b_ppo
-default_local_dir=$DATA_ROOT/checkpoint/$experiment_name
-
-# ================= algorithm =================
-adv_estimator=gae
-
-use_kl_in_reward=False
-kl_coef=0.0
-use_kl_loss=False
-kl_loss_coef=0.0
-
-clip_ratio_low=0.2
-clip_ratio_high=0.28
-
-max_turns=8
-max_prompt_length=2048
-max_response_length=16384
-actor_lr=1e-6
-critic_lr=2e-6
-gae_gamma=1.0
-gae_lam=1.0
-
-critic_warmup=20
-
-train_batch_size=1024
-ppo_mini_batch_size=256
-n_resp_per_prompt_val=30
-
-# ================= perfomance =================
-infer_tp=4 # vllm
-train_sp=4 # train
-
-offload=True
-
-actor_max_token_len_per_gpu=$(( (max_prompt_length + max_response_length) * 2 ))
-critic_max_token_len_per_gpu=$(( (max_prompt_length + max_response_length) * 4 ))
-
-
-python3 -m verl.trainer.main_ppo \
- algorithm.adv_estimator=$adv_estimator \
- algorithm.use_kl_in_reward=$use_kl_in_reward \
- algorithm.kl_ctrl.kl_coef=$kl_coef \
- algorithm.gamma=$gae_gamma \
- algorithm.lam=$gae_lam \
- data.train_files="$train_files" \
- data.val_files="$test_files" \
- data.return_raw_chat=True \
- data.train_batch_size=$train_batch_size \
- data.max_prompt_length=$max_prompt_length \
- data.max_response_length=$max_response_length \
- data.filter_overlong_prompts=True \
- data.truncation='error' \
- data.custom_cls.path=recipe/retool/retool.py \
- data.custom_cls.name=CustomRLHFDataset \
- custom_reward_function.path=recipe/retool/retool.py \
- custom_reward_function.name=compute_score \
- actor_rollout_ref.model.path=$actor_model_path \
- actor_rollout_ref.model.use_remove_padding=True \
- actor_rollout_ref.model.enable_gradient_checkpointing=True \
- actor_rollout_ref.actor.use_kl_loss=$use_kl_loss \
- actor_rollout_ref.actor.kl_loss_coef=$kl_loss_coef \
- actor_rollout_ref.actor.clip_ratio_low=$clip_ratio_low \
- actor_rollout_ref.actor.clip_ratio_high=$clip_ratio_high \
- actor_rollout_ref.actor.clip_ratio_c=10.0 \
- actor_rollout_ref.actor.optim.lr=$actor_lr \
- actor_rollout_ref.actor.use_dynamic_bsz=True \
- actor_rollout_ref.actor.ppo_mini_batch_size=$ppo_mini_batch_size \
- actor_rollout_ref.actor.ppo_max_token_len_per_gpu=$actor_max_token_len_per_gpu \
- actor_rollout_ref.actor.ulysses_sequence_parallel_size=$train_sp \
- actor_rollout_ref.actor.fsdp_config.param_offload=$offload \
- actor_rollout_ref.actor.fsdp_config.optimizer_offload=$offload \
- actor_rollout_ref.rollout.name=vllm \
- actor_rollout_ref.rollout.mode=async \
- actor_rollout_ref.rollout.tensor_model_parallel_size=$infer_tp \
- actor_rollout_ref.rollout.multi_turn.enable=True \
- actor_rollout_ref.rollout.multi_turn.max_user_turns=$max_turns \
- actor_rollout_ref.rollout.multi_turn.max_assistant_turns=$max_turns \
- actor_rollout_ref.rollout.multi_turn.tool_config_path=$tool_config_path \
- actor_rollout_ref.rollout.multi_turn.format=hermes \
- actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
- actor_rollout_ref.rollout.val_kwargs.top_p=0.6 \
- actor_rollout_ref.rollout.val_kwargs.temperature=1.0 \
- actor_rollout_ref.rollout.val_kwargs.n=$n_resp_per_prompt_val \
- critic.optim.lr=$critic_lr \
- critic.model.use_remove_padding=True \
- critic.model.path=$critic_model_path \
- critic.model.enable_gradient_checkpointing=True \
- critic.ppo_max_token_len_per_gpu=$critic_max_token_len_per_gpu \
- critic.ulysses_sequence_parallel_size=$train_sp \
- critic.model.fsdp_config.param_offload=$offload \
- critic.model.fsdp_config.optimizer_offload=$offload \
- trainer.critic_warmup=$critic_warmup \
- trainer.logger=['console','wandb'] \
- trainer.project_name=$project_name \
- trainer.experiment_name=$experiment_name \
- trainer.n_gpus_per_node=8 \
- trainer.val_before_train=True \
- trainer.log_val_generations=100 \
- trainer.nnodes=2 \
- trainer.save_freq=30 \
- trainer.default_local_dir=$default_local_dir \
- trainer.test_freq=5 \
- trainer.total_epochs=1 $@
diff --git a/recipe/retool/run_qwen2-32b_sft.sh b/recipe/retool/run_qwen2-32b_sft.sh
deleted file mode 100644
index d218b0e7eb8..00000000000
--- a/recipe/retool/run_qwen2-32b_sft.sh
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/bin/bash
-set -x
-
-nnodes=2
-nproc_per_node=8
-master_addr=
-master_port=
-
-experiment_name=multiturn-sft-qwen-2.5-32b-instruct
-HDFS_ROOT=${HDFS_ROOT:-$PWD}
-DATA_ROOT=${DATA_ROOT:-$PWD}
-
-TRAIN_DATA=$DATA_ROOT/dataset/wuxibin/ReTool-SFT/data/train-00000-of-00001.parquet
-EVAL_DATA=$DATA_ROOT/dataset/wuxibin/ReTool-SFT/data/train-00000-of-00001.parquet
-MODEL_PATH=$HDFS_ROOT/model/Qwen2.5-32B-Instruct
-SAVE_PATH=$DATA_ROOT/checkpoint/$experiment_name
-
-torchrun --nnodes=$nnodes \
- --nproc_per_node=$nproc_per_node \
- --master-addr=$master_addr \
- --master-port=$master_port \
- --node-rank=$node_rank \
- -m verl.trainer.fsdp_sft_trainer \
- data.train_files=$TRAIN_DATA \
- data.val_files=$EVAL_DATA \
- data.max_length=16384 \
- data.train_batch_size=32 \
- data.multiturn.enable=true \
- data.multiturn.messages_key=messages \
- data.multiturn.tools_key=tools \
- data.micro_batch_size_per_gpu=4 \
- model.partial_pretrain=$MODEL_PATH \
- model.strategy=fsdp \
- trainer.default_local_dir=$SAVE_PATH \
- trainer.project_name=wuxibin-multiturn-sft \
- trainer.experiment_name=$experiment_name \
- trainer.logger='["console","wandb"]' \
- trainer.total_epochs=6 \
- ulysses_sequence_parallel_size=4 \
- use_remove_padding=true
\ No newline at end of file
diff --git a/recipe/retool/run_qwen2_7b_dapo.sh b/recipe/retool/run_qwen2_7b_dapo.sh
deleted file mode 100644
index f1187a3d12d..00000000000
--- a/recipe/retool/run_qwen2_7b_dapo.sh
+++ /dev/null
@@ -1,109 +0,0 @@
-set -x
-
-export VLLM_USE_V1=1
-
-# ================= data/model/tool =================
-HDFS_ROOT=${HDFS_ROOT:-$PWD}
-DATA_ROOT=${DATA_ROOT:-$PWD}
-
-dapo_math_17k=$DATA_ROOT/dataset/BytedTsinghua-SIA/DAPO-Math-17k
-aime_2024=$DATA_ROOT/dataset/Maxwell-Jia/AIME_2024
-aime_2025=$DATA_ROOT/dataset/yentinglin/aime_2025
-model_path=$HDFS_ROOT/checkpoint/multiturn-sft-qwen-2.5-7b-instruct/global_step_372
-
-train_files="['$dapo_math_17k']"
-test_files="['$aime_2025', '$aime_2024']"
-
-# tool
-tool_config_path=recipe/retool/sandbox_fusion_tool_config.yaml
-
-# wandb
-project_name=retool
-experiment_name=qwen2.5-7b_dapo
-default_local_dir=$DATA_ROOT/checkpoint/$experiment_name
-
-# ================= algorithm =================
-adv_estimator=grpo
-
-use_kl_in_reward=False
-kl_coef=0.0
-use_kl_loss=False
-kl_loss_coef=0.0
-
-clip_ratio_low=0.2
-clip_ratio_high=0.28
-
-max_turns=16
-max_prompt_length=2048
-max_response_length=16384
-actor_lr=1e-6
-
-train_batch_size=64
-ppo_mini_batch_size=16
-n_resp_per_prompt=16
-n_resp_per_prompt_val=30
-
-# ================= perfomance =================
-infer_tp=4 # vllm
-train_sp=4 # train
-offload=True
-
-actor_max_token_len_per_gpu=$(( (max_prompt_length + max_response_length) * 1 ))
-log_prob_max_token_len_per_gpu=$(( actor_max_token_len_per_gpu * 4 ))
-
-python3 -m verl.trainer.main_ppo \
- algorithm.adv_estimator=$adv_estimator \
- algorithm.use_kl_in_reward=$use_kl_in_reward \
- algorithm.kl_ctrl.kl_coef=$kl_coef \
- data.train_files="$train_files" \
- data.val_files="$test_files" \
- data.return_raw_chat=True \
- data.train_batch_size=$train_batch_size \
- data.max_prompt_length=$max_prompt_length \
- data.max_response_length=$max_response_length \
- data.filter_overlong_prompts=True \
- data.truncation='error' \
- data.custom_cls.path=recipe/retool/retool.py \
- data.custom_cls.name=CustomRLHFDataset \
- custom_reward_function.path=recipe/retool/retool.py \
- custom_reward_function.name=compute_score \
- actor_rollout_ref.model.path=$model_path \
- actor_rollout_ref.model.use_remove_padding=True \
- actor_rollout_ref.model.enable_gradient_checkpointing=True \
- actor_rollout_ref.actor.use_kl_loss=$use_kl_loss \
- actor_rollout_ref.actor.kl_loss_coef=$kl_loss_coef \
- actor_rollout_ref.actor.clip_ratio_low=$clip_ratio_low \
- actor_rollout_ref.actor.clip_ratio_high=$clip_ratio_high \
- actor_rollout_ref.actor.clip_ratio_c=10.0 \
- actor_rollout_ref.actor.optim.lr=$actor_lr \
- actor_rollout_ref.actor.use_dynamic_bsz=True \
- actor_rollout_ref.actor.ppo_mini_batch_size=$ppo_mini_batch_size \
- actor_rollout_ref.actor.ppo_max_token_len_per_gpu=$actor_max_token_len_per_gpu \
- actor_rollout_ref.actor.ulysses_sequence_parallel_size=$train_sp \
- actor_rollout_ref.actor.fsdp_config.param_offload=$offload \
- actor_rollout_ref.actor.fsdp_config.optimizer_offload=$offload \
- actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=$log_prob_max_token_len_per_gpu \
- actor_rollout_ref.rollout.name=vllm \
- actor_rollout_ref.rollout.mode=async \
- actor_rollout_ref.rollout.tensor_model_parallel_size=$infer_tp \
- actor_rollout_ref.rollout.multi_turn.enable=True \
- actor_rollout_ref.rollout.multi_turn.max_user_turns=$max_turns \
- actor_rollout_ref.rollout.multi_turn.max_assistant_turns=$max_turns \
- actor_rollout_ref.rollout.multi_turn.tool_config_path=$tool_config_path \
- actor_rollout_ref.rollout.multi_turn.format=hermes \
- actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
- actor_rollout_ref.rollout.n=$n_resp_per_prompt \
- actor_rollout_ref.rollout.val_kwargs.top_p=0.6 \
- actor_rollout_ref.rollout.val_kwargs.temperature=1.0 \
- actor_rollout_ref.rollout.val_kwargs.n=$n_resp_per_prompt_val \
- trainer.logger=['console','wandb'] \
- trainer.project_name=$project_name \
- trainer.experiment_name=$experiment_name \
- trainer.n_gpus_per_node=8 \
- trainer.val_before_train=True \
- trainer.log_val_generations=20 \
- trainer.nnodes=1 \
- trainer.save_freq=20 \
- trainer.default_local_dir=$default_local_dir \
- trainer.test_freq=10 \
- trainer.total_epochs=1 $@
diff --git a/recipe/retool/run_qwen2_7b_sft.sh b/recipe/retool/run_qwen2_7b_sft.sh
deleted file mode 100644
index e4369e167e3..00000000000
--- a/recipe/retool/run_qwen2_7b_sft.sh
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/bin/bash
-set -x
-
-nnodes=1
-nproc_per_node=8
-master_addr=
-master_port=
-node_rank=${ARNOLD_ID:-0}
-
-project_name=retool
-experiment_name=multiturn-sft-qwen-2.5-7b-instruct
-
-HDFS_ROOT=${HDFS_ROOT:-$PWD}
-DATA_ROOT=${DATA_ROOT:-$PWD}
-
-TRAIN_DATA=$DATA_ROOT/dataset/wuxibin/ReTool-SFT/data/train-00000-of-00001.parquet
-EVAL_DATA=$DATA_ROOT/dataset/wuxibin/ReTool-SFT/data/train-00000-of-00001.parquet
-MODEL_PATH=$HDFS_ROOT/model/Qwen2.5-7B-Instruct
-SAVE_PATH=$DATA_ROOT/checkpoint/$experiment_name
-
-torchrun --nnodes=$nnodes \
- --nproc_per_node=$nproc_per_node \
- --master-addr=$master_addr \
- --master-port=$master_port \
- --node-rank=$node_rank \
- -m verl.trainer.fsdp_sft_trainer \
- data.train_files=$TRAIN_DATA \
- data.val_files=$EVAL_DATA \
- data.max_length=16384 \
- data.train_batch_size=32 \
- data.multiturn.enable=true \
- data.multiturn.messages_key=messages \
- data.multiturn.tools_key=tools \
- data.micro_batch_size_per_gpu=4 \
- model.partial_pretrain=$MODEL_PATH \
- model.strategy=fsdp \
- trainer.default_local_dir=$SAVE_PATH \
- trainer.project_name=wuxibin-multiturn-sft \
- trainer.experiment_name=$experiment_name \
- trainer.logger='["console","wandb"]' \
- trainer.total_epochs=6 \
- trainer.save_freq=62 \
- ulysses_sequence_parallel_size=4 \
- use_remove_padding=true
\ No newline at end of file
diff --git a/recipe/retool/run_qwen2_7b_sft_npu.sh b/recipe/retool/run_qwen2_7b_sft_npu.sh
deleted file mode 100644
index ba203e6bee0..00000000000
--- a/recipe/retool/run_qwen2_7b_sft_npu.sh
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/bin/bash
-set -x
-
-nnodes=1
-nproc_per_node=8
-
-project_name=retool_sft
-experiment_name=multiturn-sft-qwen-2.5-7b-instruct
-
-TRAIN_DATA=PATH/TO/ReTool-SFT/data/train-00000-of-00001.parquet
-EVAL_DATA=PATH/TO/ReTool-SFT/data/train-00000-of-00001.parquet
-MODEL_PATH=PATH/TO/Qwen2.5-7B-Instruct
-SAVE_PATH=PATH/TO/checkpoint/$experiment_name
-
-torchrun --nnodes=$nnodes \
- --nproc_per_node=$nproc_per_node \
- -m verl.trainer.fsdp_sft_trainer \
- data.train_files=$TRAIN_DATA \
- data.val_files=$EVAL_DATA \
- data.max_length=16384 \
- data.train_batch_size=64 \
- data.multiturn.enable=true \
- data.multiturn.messages_key=messages \
- data.multiturn.tools_key=tools \
- data.micro_batch_size_per_gpu=8 \
- model.partial_pretrain=$MODEL_PATH \
- model.strategy=fsdp \
- trainer.default_local_dir=$SAVE_PATH \
- trainer.project_name=$project_name \
- trainer.experiment_name=$experiment_name \
- trainer.logger='["console"]' \
- trainer.total_epochs=6 \
- trainer.save_freq=10 \
- trainer.device=npu \
- ulysses_sequence_parallel_size=4 \
- use_remove_padding=true
\ No newline at end of file
diff --git a/recipe/retool/sandbox_fusion_tool_config.yaml b/recipe/retool/sandbox_fusion_tool_config.yaml
deleted file mode 100644
index 71b10e50ec9..00000000000
--- a/recipe/retool/sandbox_fusion_tool_config.yaml
+++ /dev/null
@@ -1,24 +0,0 @@
-tools:
- - class_name: "recipe.retool.retool.CustomSandboxFusionTool"
- config:
- sandbox_fusion_url: "http://localhost:8080/run_code"
- num_workers: 128
- enable_global_rate_limit: true
- rate_limit: 128
- default_timeout: 30
- default_language: "python"
- memory_limit_mb: 1024
- type: native
-
- tool_schema:
- type: "function"
- function:
- name: "code_interpreter"
- description: "A tool for executing code."
- parameters:
- type: "object"
- properties:
- code:
- type: "string"
- description: "The code to execute."
- required: ["code"]