diff --git a/nemo_gym/openai_utils.py b/nemo_gym/openai_utils.py index baae614768..b1078148f8 100644 --- a/nemo_gym/openai_utils.py +++ b/nemo_gym/openai_utils.py @@ -78,7 +78,13 @@ from pydantic import BaseModel, ConfigDict, Field from typing_extensions import TypedDict -from nemo_gym.server_utils import MAX_NUM_TRIES, ClientResponse, raise_for_status, request +from nemo_gym.server_utils import ( + _GLOBAL_AIOHTTP_CLIENT_REQUEST_DEBUG, + MAX_NUM_TRIES, + ClientResponse, + raise_for_status, + request, +) ######################################## @@ -466,7 +472,7 @@ async def _request(self, **request_kwargs: Dict) -> ClientResponse: response.raise_for_status() async def _raise_for_status(self, response: ClientResponse, request_kwargs: Dict[str, Any]) -> None: - if not response.ok: + if not response.ok and _GLOBAL_AIOHTTP_CLIENT_REQUEST_DEBUG: print(f"Request kwargs: {json.dumps(request_kwargs)}") await raise_for_status(response) diff --git a/nemo_gym/server_utils.py b/nemo_gym/server_utils.py index e263db430b..b775e97489 100644 --- a/nemo_gym/server_utils.py +++ b/nemo_gym/server_utils.py @@ -171,7 +171,8 @@ async def request( async def raise_for_status(response: ClientResponse) -> None: # pragma: no cover if not response.ok: content = await response.content.read() - print(f"""Request info: {response.request_info} + if _GLOBAL_AIOHTTP_CLIENT_REQUEST_DEBUG: + print(f"""Request info: {response.request_info} Response content: {content}""") try: diff --git a/resources_servers/aviary/README.md b/resources_servers/aviary/README.md new file mode 100644 index 0000000000..d0ef5b06f7 --- /dev/null +++ b/resources_servers/aviary/README.md @@ -0,0 +1,43 @@ +> Keywords: Tool Use, Multi-step Reasoning, Environment Interaction, Scientific Tasks + +This resource server adapts [Aviary environments](https://github.com/Future-House/aviary) into the NeMo Gym resources-server interface, so NeMo Gym agents can interact with Aviary `Environment`s. 
This allows one to implement tool and environment logic in Aviary, and deploy the environment for inference or training with Gym. + +### Implemented servers in this folder + +- **GSM8K**: `gsm8k_app.py` + - Meant primarily as an example, this implements [GSM8K](https://arxiv.org/abs/2110.14168) as a set of environments equipped with a calculator tool. +- **HotPotQA**: `hotpotqa_app.py` + - The HotPotQA environment asks agents to perform multi-hop question answering on the [HotPotQA dataset](https://aclanthology.org/D18-1259/). +- **BixBench**: `notebook_app.py` + - Implements the [BixBench dataset](https://arxiv.org/abs/2503.00096) as a set of environments that allow execution of a Jupyter notebook. + - Also serves as an example of how to implement notebook-backed environments for other scientific computational tasks. +- **Client/proxy to a remote Aviary dataset server**: `client_app.py` + - A generic interface to an Aviary `TaskDatasetServer`. Can be used to interact with any Aviary environment served remotely.
+ + +# Example usage + +Run the GSM8K Aviary resources server together with a model config: + +```bash +config_paths="resources_servers/aviary/configs/gsm8k_aviary.yaml,\ +responses_api_models/vllm_model/configs/vllm_model.yaml" +ng_run "+config_paths=[$config_paths]" +``` + +Then collect rollouts: + +```bash +ng_collect_rollouts \ + +agent_name=gsm8k_aviary_agent +input_jsonl_fpath=resources_servers/aviary/data/example.jsonl \ + +output_jsonl_fpath=resources_servers/aviary/data/example_rollouts.jsonl +``` + +# Licensing information +Code: Apache 2.0 + +Data: MIT (GSM8k), Apache 2.0 (BixBench) + +Dependencies +- nemo_gym: Apache 2.0 +- aviary: Apache 2.0 \ No newline at end of file diff --git a/resources_servers/aviary/__init__.py b/resources_servers/aviary/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/resources_servers/aviary/app.py b/resources_servers/aviary/app.py new file mode 100644 index 0000000000..260160a59e --- /dev/null +++ b/resources_servers/aviary/app.py @@ -0,0 +1,164 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import json +import logging +import uuid +from abc import ABC +from collections import defaultdict +from typing import Generic, TypeVar, cast + +from fastapi import FastAPI, Request +from openai.types.responses import FunctionToolParam +from pydantic import ConfigDict, Field + +from aviary.core import ( + Environment, + EnvStateMessage, + Message, + TaskDataset, + Tool, + ToolCall, + ToolCallFunction, + ToolRequestMessage, + ToolResponseMessage, +) +from nemo_gym.base_resources_server import SimpleResourcesServer +from nemo_gym.openai_utils import NeMoGymEasyInputMessage, NeMoGymFunctionCallOutput + +from .schemas import ( + AviaryAgentVerifyRequest, + AviaryAgentVerifyResponse, + AviaryCloseRequest, + AviaryCloseResponse, + AviaryEnvStateEasyInputMessage, + AviaryResourcesServerConfig, + AviarySeedSessionRequest, + AviarySeedSessionResponse, + AviaryStepRequest, + AviaryStepResponse, +) + + +logger = logging.getLogger(__name__) + + +TEnv = TypeVar("TEnv", bound=Environment) +TDataset = TypeVar("TDataset", bound=TaskDataset) + + +def tool_to_function_tool_param(tool: Tool) -> FunctionToolParam: + tool_dump = tool.info.model_dump() + tool_dump["parameters"].setdefault("additionalProperties", False) + return FunctionToolParam(type="function", strict=True, **tool_dump) + + +def obs_msg_to_nemo_gym(obs: Message) -> list[NeMoGymEasyInputMessage]: + # This does some Qwen3-specific things: + # 1. if content is a JSON list, we flatten it to a list of messages. Qwen3's + # chat template doesn't support messages with list contents + # 2. if content contains images (or really any other media), we drop it for now. + # Most of this is what we'd call a HACK. 
+ + is_env_state = isinstance(obs, EnvStateMessage) or (obs.info or {}).get("is_env_state", False) + + dump = obs.model_dump() + try: + content: str | list = json.loads(dump["content"]) + except json.JSONDecodeError: + content = dump["content"] + + flat_content: list[str] = [] + if isinstance(content, list): + flat_content = [c["text"] for c in content if c["type"] == "text"] + else: + flat_content = [content] + + message_cls = AviaryEnvStateEasyInputMessage if is_env_state else NeMoGymEasyInputMessage + return [message_cls.model_validate(dump | {"content": c}) for c in flat_content] + + +class AviaryResourcesServer(SimpleResourcesServer, Generic[TEnv, TDataset], ABC): + model_config = ConfigDict(arbitrary_types_allowed=True) + + config: AviaryResourcesServerConfig + dataset: TDataset + env_id_to_env: dict[str, TEnv] = Field(default_factory=dict) + env_id_to_total_reward: dict[str, float] = Field(default_factory=lambda: defaultdict(float)) + + def setup_webserver(self) -> FastAPI: + app = super().setup_webserver() + app.post("/step")(self.step) + app.post("/close")(self.close) + return app + + async def seed_session(self, request: Request, body: AviarySeedSessionRequest) -> AviarySeedSessionResponse: + """ + Wraps creation of the Aviary environment and calling reset(). + """ + env_id = str(uuid.uuid4()) + env = cast(Environment, self.dataset.get_new_env_by_idx(body.task_idx)) + self.env_id_to_env[env_id] = env + + obs, tools = await env.reset() + return AviarySeedSessionResponse( + env_id=env_id, + obs=[message for o in obs for message in obs_msg_to_nemo_gym(o)], + tools=[tool_to_function_tool_param(t) for t in tools], + ) + + async def step(self, request: Request, body: AviaryStepRequest) -> AviaryStepResponse: + """ + Wraps calling step(). 
+ """ + try: + env = self.env_id_to_env[body.env_id] + + action = ToolRequestMessage( + content=None, + tool_calls=[ + ToolCall(id=a.call_id, function=ToolCallFunction(name=a.name, arguments=json.loads(a.arguments))) + for a in body.action + ], + ) + obs, reward, done, _ = await env.step(action) + + self.env_id_to_total_reward[body.env_id] += reward + + nemo_obs = [ + message + for o in obs + for message in ( + [NeMoGymFunctionCallOutput(call_id=o.tool_call_id, output=o.content)] + if isinstance(o, ToolResponseMessage) + else obs_msg_to_nemo_gym(o) + ) + ] + except Exception: + logger.exception("Error in step") + raise + + return AviaryStepResponse(obs=nemo_obs, reward=reward, done=done) + + async def verify(self, request: Request, body: AviaryAgentVerifyRequest) -> AviaryAgentVerifyResponse: + return AviaryAgentVerifyResponse(**body.model_dump(), reward=self.env_id_to_total_reward[body.response.env_id]) + + async def close(self, request: Request, body: AviaryCloseRequest) -> AviaryCloseResponse: + """ + Closes and deregisters body.env_id. + """ + try: + await self.env_id_to_env.pop(body.env_id).close() + except Exception as e: + return AviaryCloseResponse(message=repr(e), success=False) + return AviaryCloseResponse(message="Success", success=True) diff --git a/resources_servers/aviary/client_app.py b/resources_servers/aviary/client_app.py new file mode 100644 index 0000000000..5e94acc117 --- /dev/null +++ b/resources_servers/aviary/client_app.py @@ -0,0 +1,64 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os + +from pydantic import field_validator, model_validator + +from aviary.core import TaskDatasetClient, TaskEnvironmentClient +from resources_servers.aviary.app import AviaryResourcesServer, AviaryResourcesServerConfig + + +class AviaryClientResourcesServerConfig(AviaryResourcesServerConfig): + server_url: str | None = None + request_timeout: float | None = 300.0 + api_key: str | None = None + + @field_validator("server_url") + @classmethod + def validate_server_url(cls, v: str | None) -> str | None: + if v is None: + return os.getenv("AVIARY_SERVER_URL") + return v + + @field_validator("api_key") + @classmethod + def validate_api_key(cls, v: str | None) -> str | None: + if v is None: + # Note that if AVIARY_SERVER_API_KEY is not set, we will return + # None, which is also valid - assuming the server does + # not have auth enabled.
+ return os.getenv("AVIARY_SERVER_API_KEY") + return v + + +class AviaryClientResourcesServer(AviaryResourcesServer[TaskEnvironmentClient, TaskDatasetClient]): + config: AviaryClientResourcesServerConfig + dataset: TaskDatasetClient + + @model_validator(mode="before") + @classmethod + def load_dataset(cls, data: dict) -> dict: + if "dataset" not in data: + config = data["config"] = AviaryClientResourcesServerConfig.model_validate(data.get("config", {})) + data["dataset"] = TaskDatasetClient( + server_url=config.server_url, + request_timeout=config.request_timeout, + api_key=config.api_key, + catch_http_errors=True, + ) + return data + + +if __name__ == "__main__": + AviaryClientResourcesServer.run_webserver() diff --git a/resources_servers/aviary/configs/aviary.yaml b/resources_servers/aviary/configs/aviary.yaml new file mode 100644 index 0000000000..d3e1a36253 --- /dev/null +++ b/resources_servers/aviary/configs/aviary.yaml @@ -0,0 +1,40 @@ +# NOTE: AviaryResourcesServer is an abstract base server and cannot be instantiated directly, +# so here we run the GSM8k server for tests/examples. Users should use the other configurations +# in this directory to instantiate their desired server. 
+gsm8k_aviary_resources_server: + resources_servers: + aviary: + entrypoint: gsm8k_app.py + domain: math + verified: false +gsm8k_aviary_agent: + responses_api_agents: + aviary_agent: + entrypoint: app.py + resources_server: + type: resources_servers + name: gsm8k_aviary_resources_server + model_server: + type: responses_api_models + name: policy_model + max_steps: 10 + datasets: + - name: train + type: train + jsonl_fpath: resources_servers/aviary/data/gsm8k_train.jsonl + gitlab_identifier: + dataset_name: PLACEHOLDER + version: 0.0.1 + artifact_fpath: train.jsonl + license: Apache 2.0 + - name: validation + type: validation + jsonl_fpath: resources_servers/aviary/data/gsm8k_validation.jsonl + gitlab_identifier: + dataset_name: PLACEHOLDER + version: 0.0.1 + artifact_fpath: train.jsonl + license: Apache 2.0 + - name: example + type: example + jsonl_fpath: resources_servers/aviary/data/gsm8k_example.jsonl diff --git a/resources_servers/aviary/configs/bixbench_aviary.yaml b/resources_servers/aviary/configs/bixbench_aviary.yaml new file mode 100644 index 0000000000..0bc7d3fd24 --- /dev/null +++ b/resources_servers/aviary/configs/bixbench_aviary.yaml @@ -0,0 +1,16 @@ +bixbench_aviary_resources_server: + resources_servers: + aviary: + entrypoint: notebook_app.py + domain: coding + verified: false +bixbench_aviary_agent: + responses_api_agents: + aviary_agent: + entrypoint: app.py + resources_server: + type: resources_servers + name: bixbench_aviary_resources_server + model_server: + type: responses_api_models + name: policy_model diff --git a/resources_servers/aviary/configs/gsm8k_aviary.yaml b/resources_servers/aviary/configs/gsm8k_aviary.yaml new file mode 100644 index 0000000000..e486570c68 --- /dev/null +++ b/resources_servers/aviary/configs/gsm8k_aviary.yaml @@ -0,0 +1,37 @@ +gsm8k_aviary_resources_server: + resources_servers: + aviary: + entrypoint: gsm8k_app.py + domain: math + verified: false +gsm8k_aviary_agent: + responses_api_agents: + aviary_agent: + 
entrypoint: app.py + resources_server: + type: resources_servers + name: gsm8k_aviary_resources_server + model_server: + type: responses_api_models + name: policy_model + max_steps: 10 + datasets: + - name: train + type: train + jsonl_fpath: resources_servers/aviary/data/gsm8k_train.jsonl + gitlab_identifier: + dataset_name: PLACEHOLDER + version: 0.0.1 + artifact_fpath: train.jsonl + license: Apache 2.0 + - name: validation + type: validation + jsonl_fpath: resources_servers/aviary/data/gsm8k_validation.jsonl + gitlab_identifier: + dataset_name: PLACEHOLDER + version: 0.0.1 + artifact_fpath: train.jsonl + license: Apache 2.0 + - name: example + type: example + jsonl_fpath: resources_servers/aviary/data/gsm8k_example.jsonl diff --git a/resources_servers/aviary/configs/hotpotqa_aviary.yaml b/resources_servers/aviary/configs/hotpotqa_aviary.yaml new file mode 100644 index 0000000000..2264793eef --- /dev/null +++ b/resources_servers/aviary/configs/hotpotqa_aviary.yaml @@ -0,0 +1,36 @@ +hotpotqa_aviary_resources_server: + resources_servers: + aviary: + entrypoint: hotpotqa_app.py + domain: agent + verified: false +hotpotqa_aviary_agent: + responses_api_agents: + aviary_agent: + entrypoint: app.py + resources_server: + type: resources_servers + name: hotpotqa_aviary_resources_server + model_server: + type: responses_api_models + name: policy_model + datasets: + - name: train + type: train + jsonl_fpath: resources_servers/aviary/data/hotpotqa_train.jsonl + gitlab_identifier: + dataset_name: PLACEHOLDER + version: 0.0.1 + artifact_fpath: train.jsonl + license: Apache 2.0 + - name: validation + type: validation + jsonl_fpath: resources_servers/aviary/data/hotpotqa_validation.jsonl + gitlab_identifier: + dataset_name: PLACEHOLDER + version: 0.0.1 + artifact_fpath: validation.jsonl + license: Apache 2.0 + - name: example + type: example + jsonl_fpath: resources_servers/aviary/data/hotpotqa_example.jsonl diff --git a/resources_servers/aviary/data/.gitignore 
b/resources_servers/aviary/data/.gitignore new file mode 100644 index 0000000000..4424b6fde0 --- /dev/null +++ b/resources_servers/aviary/data/.gitignore @@ -0,0 +1,5 @@ +*train.jsonl +*validation.jsonl +*train_prepare.jsonl +*validation_prepare.jsonl +*example_prepare.jsonl diff --git a/resources_servers/aviary/data/bixbench_example.jsonl b/resources_servers/aviary/data/bixbench_example.jsonl new file mode 100644 index 0000000000..a6da326540 --- /dev/null +++ b/resources_servers/aviary/data/bixbench_example.jsonl @@ -0,0 +1,5 @@ +{"task_idx": 0} +{"task_idx": 1} +{"task_idx": 2} +{"task_idx": 3} +{"task_idx": 4} diff --git a/resources_servers/aviary/data/example.jsonl b/resources_servers/aviary/data/example.jsonl new file mode 100644 index 0000000000..8ccaf08a8f --- /dev/null +++ b/resources_servers/aviary/data/example.jsonl @@ -0,0 +1,5 @@ +{"task_idx": 0, "responses_create_params": {"input": []}, "agent_ref": {"type": "responses_api_agents", "name": "gsm8k_aviary_agent"}} +{"task_idx": 1, "responses_create_params": {"input": []}, "agent_ref": {"type": "responses_api_agents", "name": "gsm8k_aviary_agent"}} +{"task_idx": 2, "responses_create_params": {"input": []}, "agent_ref": {"type": "responses_api_agents", "name": "gsm8k_aviary_agent"}} +{"task_idx": 3, "responses_create_params": {"input": []}, "agent_ref": {"type": "responses_api_agents", "name": "gsm8k_aviary_agent"}} +{"task_idx": 4, "responses_create_params": {"input": []}, "agent_ref": {"type": "responses_api_agents", "name": "gsm8k_aviary_agent"}} diff --git a/resources_servers/aviary/data/example_metrics.json b/resources_servers/aviary/data/example_metrics.json new file mode 100644 index 0000000000..7502b1fb57 --- /dev/null +++ b/resources_servers/aviary/data/example_metrics.json @@ -0,0 +1,43 @@ +{ + "Number of examples": 5, + "Number of tools": { + "Total # non-null values": 0, + "Average": 0.0, + "Min": 0.0, + "Max": 0.0, + "Median": 0.0, + "Standard deviation": 0.0 + }, + "Json-dumped number of 
words (proxy for token count)": { + "Total # non-null values": 5, + "Average": 2.0, + "Min": 2.0, + "Max": 2.0, + "Median": 2.0, + "Standard deviation": 0.0 + }, + "Number of turns": { + "Total # non-null values": 0, + "Average": 0.0, + "Min": 0.0, + "Max": 0.0, + "Median": 0.0, + "Standard deviation": 0.0 + }, + "Temperature": { + "Total # non-null values": 0, + "Average": 0.0, + "Min": 0.0, + "Max": 0.0, + "Median": 0.0, + "Standard deviation": 0.0 + }, + "task_idx": { + "Total # non-null values": 5, + "Average": 2.0, + "Min": 0.0, + "Max": 4.0, + "Median": 2.0, + "Standard deviation": 1.58 + } +} \ No newline at end of file diff --git a/resources_servers/aviary/data/example_rollouts.jsonl b/resources_servers/aviary/data/example_rollouts.jsonl new file mode 100644 index 0000000000..2afcf307d6 --- /dev/null +++ b/resources_servers/aviary/data/example_rollouts.jsonl @@ -0,0 +1,5 @@ +{"responses_create_params": {"background": null, "include": null, "input": [], "instructions": null, "max_output_tokens": null, "max_tool_calls": null, "metadata": null, "model": null, "parallel_tool_calls": true, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "store": null, "temperature": null, "text": null, "tool_choice": "auto", "tools": [], "top_logprobs": null, "top_p": null, "truncation": null, "user": null, "stream": null}, "response": {"id": "resp_08e560d9508d44728796007bcbfda42b", "created_at": 1765834047.0, "error": null, "incomplete_details": null, "instructions": null, "metadata": null, "model": "gpt-4.1-2025-04-14", "object": "response", "output": [[{"content": "Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. 
How much did she earn?", "role": "user", "type": "message"}, {"arguments": "{\"expr\":\"12 * (50 / 60)\"}", "call_id": "call_0hWFYwBvZghsoIwtRIYxuFKL", "name": "calculator", "type": "function_call", "id": "call_0hWFYwBvZghsoIwtRIYxuFKL", "status": "completed"}, {"call_id": "call_0hWFYwBvZghsoIwtRIYxuFKL", "output": "10", "type": "function_call_output", "id": null, "status": null}], [{"content": "Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?", "role": "user", "type": "message"}, {"arguments": "{\"expr\":\"12 * (50 / 60)\"}", "call_id": "call_0hWFYwBvZghsoIwtRIYxuFKL", "name": "calculator", "type": "function_call", "id": "call_0hWFYwBvZghsoIwtRIYxuFKL", "status": "completed"}, {"call_id": "call_0hWFYwBvZghsoIwtRIYxuFKL", "output": "10", "type": "function_call_output", "id": null, "status": null}, {"id": "msg_399c67b4d5e446e884a3812334779991", "content": [{"annotations": [], "text": "Weng earned $10 for 50 minutes of babysitting at a rate of $12 per hour.", "type": "output_text", "logprobs": null}], "role": "assistant", "status": "completed", "type": "message"}]], "parallel_tool_calls": true, "temperature": null, "tool_choice": "auto", "tools": [{"name": "calculator", "parameters": {"type": "object", "properties": {"expr": {"description": "A valid Python expression.", "title": "Expr", "type": "string"}}, "required": ["expr"], "additionalProperties": false}, "strict": true, "type": "function", "description": "Calculate a mathematical expression."}, {"name": "submit_answer", "parameters": {"type": "object", "properties": {"answer": {"description": "Proposed answer.", "title": "Answer", "type": "string"}}, "required": ["answer"], "additionalProperties": false}, "strict": true, "type": "function", "description": "Submit the proposed answer and check if it is correct. 
This action is terminal."}], "top_p": null, "background": null, "conversation": null, "max_output_tokens": null, "max_tool_calls": null, "previous_response_id": null, "prompt": null, "prompt_cache_key": null, "reasoning": null, "safety_identifier": null, "service_tier": null, "status": null, "text": null, "top_logprobs": null, "truncation": null, "usage": null, "user": null, "env_id": "49803f6d-1456-4049-9aa5-8e6e13ad8465", "group_id": "0", "contains_transitions": true}, "reward": 0.0, "task_idx": 0, "agent_ref": {"type": "responses_api_agents", "name": "gsm8k_aviary_agent"}} +{"responses_create_params": {"background": null, "include": null, "input": [], "instructions": null, "max_output_tokens": null, "max_tool_calls": null, "metadata": null, "model": null, "parallel_tool_calls": true, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "store": null, "temperature": null, "text": null, "tool_choice": "auto", "tools": [], "top_logprobs": null, "top_p": null, "truncation": null, "user": null, "stream": null}, "response": {"id": "resp_f71280db679f4b02b64c492984f48131", "created_at": 1765834047.0, "error": null, "incomplete_details": null, "instructions": null, "metadata": null, "model": "gpt-4.1-2025-04-14", "object": "response", "output": [[{"content": "James writes a 3-page letter to 2 different friends twice a week. How many pages does he write a year?", "role": "user", "type": "message"}, {"arguments": "{\"expr\":\"3 * 2 * 2 * 52\"}", "call_id": "call_QyH5cjN3u8e0gvc7fi1IpUd6", "name": "calculator", "type": "function_call", "id": "call_QyH5cjN3u8e0gvc7fi1IpUd6", "status": "completed"}, {"call_id": "call_QyH5cjN3u8e0gvc7fi1IpUd6", "output": "624", "type": "function_call_output", "id": null, "status": null}], [{"content": "James writes a 3-page letter to 2 different friends twice a week. 
How many pages does he write a year?", "role": "user", "type": "message"}, {"arguments": "{\"expr\":\"3 * 2 * 2 * 52\"}", "call_id": "call_QyH5cjN3u8e0gvc7fi1IpUd6", "name": "calculator", "type": "function_call", "id": "call_QyH5cjN3u8e0gvc7fi1IpUd6", "status": "completed"}, {"call_id": "call_QyH5cjN3u8e0gvc7fi1IpUd6", "output": "624", "type": "function_call_output", "id": null, "status": null}, {"id": "msg_8f5c086864f642cf9cce35ea199bc40d", "content": [{"annotations": [], "text": "James writes a total of 624 pages a year.", "type": "output_text", "logprobs": null}], "role": "assistant", "status": "completed", "type": "message"}]], "parallel_tool_calls": true, "temperature": null, "tool_choice": "auto", "tools": [{"name": "calculator", "parameters": {"type": "object", "properties": {"expr": {"description": "A valid Python expression.", "title": "Expr", "type": "string"}}, "required": ["expr"], "additionalProperties": false}, "strict": true, "type": "function", "description": "Calculate a mathematical expression."}, {"name": "submit_answer", "parameters": {"type": "object", "properties": {"answer": {"description": "Proposed answer.", "title": "Answer", "type": "string"}}, "required": ["answer"], "additionalProperties": false}, "strict": true, "type": "function", "description": "Submit the proposed answer and check if it is correct. 
This action is terminal."}], "top_p": null, "background": null, "conversation": null, "max_output_tokens": null, "max_tool_calls": null, "previous_response_id": null, "prompt": null, "prompt_cache_key": null, "reasoning": null, "safety_identifier": null, "service_tier": null, "status": null, "text": null, "top_logprobs": null, "truncation": null, "usage": null, "user": null, "env_id": "16487abd-3208-4cee-8839-e3b53a5e9d48", "group_id": "3", "contains_transitions": true}, "reward": 0.0, "task_idx": 3, "agent_ref": {"type": "responses_api_agents", "name": "gsm8k_aviary_agent"}} +{"responses_create_params": {"background": null, "include": null, "input": [], "instructions": null, "max_output_tokens": null, "max_tool_calls": null, "metadata": null, "model": null, "parallel_tool_calls": true, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "store": null, "temperature": null, "text": null, "tool_choice": "auto", "tools": [], "top_logprobs": null, "top_p": null, "truncation": null, "user": null, "stream": null}, "response": {"id": "resp_f8cc4a32bffa4c7692170603fc4b55ba", "created_at": 1765834048.0, "error": null, "incomplete_details": null, "instructions": null, "metadata": null, "model": "gpt-4.1-2025-04-14", "object": "response", "output": [[{"content": "Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. 
How much more money does Betty need to buy the wallet?", "role": "user", "type": "message"}, {"arguments": "{\"expr\": \"100 / 2\"}", "call_id": "call_8j8ITz8XEWAiYd1WcvTENGiG", "name": "calculator", "type": "function_call", "id": "call_8j8ITz8XEWAiYd1WcvTENGiG", "status": "completed"}, {"arguments": "{\"expr\": \"15 * 2\"}", "call_id": "call_q4f6YyPbyAUnJotbouDr3HEe", "name": "calculator", "type": "function_call", "id": "call_q4f6YyPbyAUnJotbouDr3HEe", "status": "completed"}, {"call_id": "call_8j8ITz8XEWAiYd1WcvTENGiG", "output": "50", "type": "function_call_output", "id": null, "status": null}, {"call_id": "call_q4f6YyPbyAUnJotbouDr3HEe", "output": "30", "type": "function_call_output", "id": null, "status": null}], [{"content": "Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. How much more money does Betty need to buy the wallet?", "role": "user", "type": "message"}, {"arguments": "{\"expr\": \"100 / 2\"}", "call_id": "call_8j8ITz8XEWAiYd1WcvTENGiG", "name": "calculator", "type": "function_call", "id": "call_8j8ITz8XEWAiYd1WcvTENGiG", "status": "completed"}, {"arguments": "{\"expr\": \"15 * 2\"}", "call_id": "call_q4f6YyPbyAUnJotbouDr3HEe", "name": "calculator", "type": "function_call", "id": "call_q4f6YyPbyAUnJotbouDr3HEe", "status": "completed"}, {"call_id": "call_8j8ITz8XEWAiYd1WcvTENGiG", "output": "50", "type": "function_call_output", "id": null, "status": null}, {"call_id": "call_q4f6YyPbyAUnJotbouDr3HEe", "output": "30", "type": "function_call_output", "id": null, "status": null}, {"arguments": "{\"expr\":\"50 + 15 + 30\"}", "call_id": "call_bwF0HAcro5sxnecMrYOW6S9B", "name": "calculator", "type": "function_call", "id": "call_bwF0HAcro5sxnecMrYOW6S9B", "status": "completed"}, {"call_id": "call_bwF0HAcro5sxnecMrYOW6S9B", "output": "95", "type": "function_call_output", "id": null, "status": null}], 
[{"content": "Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. How much more money does Betty need to buy the wallet?", "role": "user", "type": "message"}, {"arguments": "{\"expr\": \"100 / 2\"}", "call_id": "call_8j8ITz8XEWAiYd1WcvTENGiG", "name": "calculator", "type": "function_call", "id": "call_8j8ITz8XEWAiYd1WcvTENGiG", "status": "completed"}, {"arguments": "{\"expr\": \"15 * 2\"}", "call_id": "call_q4f6YyPbyAUnJotbouDr3HEe", "name": "calculator", "type": "function_call", "id": "call_q4f6YyPbyAUnJotbouDr3HEe", "status": "completed"}, {"call_id": "call_8j8ITz8XEWAiYd1WcvTENGiG", "output": "50", "type": "function_call_output", "id": null, "status": null}, {"call_id": "call_q4f6YyPbyAUnJotbouDr3HEe", "output": "30", "type": "function_call_output", "id": null, "status": null}, {"arguments": "{\"expr\":\"50 + 15 + 30\"}", "call_id": "call_bwF0HAcro5sxnecMrYOW6S9B", "name": "calculator", "type": "function_call", "id": "call_bwF0HAcro5sxnecMrYOW6S9B", "status": "completed"}, {"call_id": "call_bwF0HAcro5sxnecMrYOW6S9B", "output": "95", "type": "function_call_output", "id": null, "status": null}, {"arguments": "{\"expr\":\"100 - 95\"}", "call_id": "call_pjRjq3CS4fjJw6RUos1GgUEt", "name": "calculator", "type": "function_call", "id": "call_pjRjq3CS4fjJw6RUos1GgUEt", "status": "completed"}, {"call_id": "call_pjRjq3CS4fjJw6RUos1GgUEt", "output": "5", "type": "function_call_output", "id": null, "status": null}], [{"content": "Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. 
How much more money does Betty need to buy the wallet?", "role": "user", "type": "message"}, {"arguments": "{\"expr\": \"100 / 2\"}", "call_id": "call_8j8ITz8XEWAiYd1WcvTENGiG", "name": "calculator", "type": "function_call", "id": "call_8j8ITz8XEWAiYd1WcvTENGiG", "status": "completed"}, {"arguments": "{\"expr\": \"15 * 2\"}", "call_id": "call_q4f6YyPbyAUnJotbouDr3HEe", "name": "calculator", "type": "function_call", "id": "call_q4f6YyPbyAUnJotbouDr3HEe", "status": "completed"}, {"call_id": "call_8j8ITz8XEWAiYd1WcvTENGiG", "output": "50", "type": "function_call_output", "id": null, "status": null}, {"call_id": "call_q4f6YyPbyAUnJotbouDr3HEe", "output": "30", "type": "function_call_output", "id": null, "status": null}, {"arguments": "{\"expr\":\"50 + 15 + 30\"}", "call_id": "call_bwF0HAcro5sxnecMrYOW6S9B", "name": "calculator", "type": "function_call", "id": "call_bwF0HAcro5sxnecMrYOW6S9B", "status": "completed"}, {"call_id": "call_bwF0HAcro5sxnecMrYOW6S9B", "output": "95", "type": "function_call_output", "id": null, "status": null}, {"arguments": "{\"expr\":\"100 - 95\"}", "call_id": "call_pjRjq3CS4fjJw6RUos1GgUEt", "name": "calculator", "type": "function_call", "id": "call_pjRjq3CS4fjJw6RUos1GgUEt", "status": "completed"}, {"call_id": "call_pjRjq3CS4fjJw6RUos1GgUEt", "output": "5", "type": "function_call_output", "id": null, "status": null}, {"arguments": "{\"answer\":\"5\"}", "call_id": "call_6AnmCBULDvXugYB38Egh0eYd", "name": "submit_answer", "type": "function_call", "id": "call_6AnmCBULDvXugYB38Egh0eYd", "status": "completed"}, {"call_id": "call_6AnmCBULDvXugYB38Egh0eYd", "output": "True", "type": "function_call_output", "id": null, "status": null}]], "parallel_tool_calls": true, "temperature": null, "tool_choice": "auto", "tools": [{"name": "calculator", "parameters": {"type": "object", "properties": {"expr": {"description": "A valid Python expression.", "title": "Expr", "type": "string"}}, "required": ["expr"], "additionalProperties": false}, "strict": true, 
"type": "function", "description": "Calculate a mathematical expression."}, {"name": "submit_answer", "parameters": {"type": "object", "properties": {"answer": {"description": "Proposed answer.", "title": "Answer", "type": "string"}}, "required": ["answer"], "additionalProperties": false}, "strict": true, "type": "function", "description": "Submit the proposed answer and check if it is correct. This action is terminal."}], "top_p": null, "background": null, "conversation": null, "max_output_tokens": null, "max_tool_calls": null, "previous_response_id": null, "prompt": null, "prompt_cache_key": null, "reasoning": null, "safety_identifier": null, "service_tier": null, "status": null, "text": null, "top_logprobs": null, "truncation": null, "usage": null, "user": null, "env_id": "22805564-2668-4b0f-af43-a1ffdf10cea3", "group_id": "1", "contains_transitions": true}, "reward": 1.0, "task_idx": 1, "agent_ref": {"type": "responses_api_agents", "name": "gsm8k_aviary_agent"}} +{"responses_create_params": {"background": null, "include": null, "input": [], "instructions": null, "max_output_tokens": null, "max_tool_calls": null, "metadata": null, "model": null, "parallel_tool_calls": true, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "store": null, "temperature": null, "text": null, "tool_choice": "auto", "tools": [], "top_logprobs": null, "top_p": null, "truncation": null, "user": null, "stream": null}, "response": {"id": "resp_0c89a59e1f704e64bfb793ea5091a8b4", "created_at": 1765834048.0, "error": null, "incomplete_details": null, "instructions": null, "metadata": null, "model": "gpt-4.1-2025-04-14", "object": "response", "output": [[{"content": "Albert is wondering how much pizza he can eat in one day. He buys 2 large pizzas and 2 small pizzas. A large pizza has 16 slices and a small pizza has 8 slices. 
If he eats it all, how many pieces does he eat that day?", "role": "user", "type": "message"}, {"arguments": "{\"expr\": \"2 * 16\"}", "call_id": "call_irHIidogP87WYJAzq3zUgOuz", "name": "calculator", "type": "function_call", "id": "call_irHIidogP87WYJAzq3zUgOuz", "status": "completed"}, {"arguments": "{\"expr\": \"2 * 8\"}", "call_id": "call_ZcoA0E9hjlDitLzFK9xhWI3y", "name": "calculator", "type": "function_call", "id": "call_ZcoA0E9hjlDitLzFK9xhWI3y", "status": "completed"}, {"call_id": "call_irHIidogP87WYJAzq3zUgOuz", "output": "32", "type": "function_call_output", "id": null, "status": null}, {"call_id": "call_ZcoA0E9hjlDitLzFK9xhWI3y", "output": "16", "type": "function_call_output", "id": null, "status": null}], [{"content": "Albert is wondering how much pizza he can eat in one day. He buys 2 large pizzas and 2 small pizzas. A large pizza has 16 slices and a small pizza has 8 slices. If he eats it all, how many pieces does he eat that day?", "role": "user", "type": "message"}, {"arguments": "{\"expr\": \"2 * 16\"}", "call_id": "call_irHIidogP87WYJAzq3zUgOuz", "name": "calculator", "type": "function_call", "id": "call_irHIidogP87WYJAzq3zUgOuz", "status": "completed"}, {"arguments": "{\"expr\": \"2 * 8\"}", "call_id": "call_ZcoA0E9hjlDitLzFK9xhWI3y", "name": "calculator", "type": "function_call", "id": "call_ZcoA0E9hjlDitLzFK9xhWI3y", "status": "completed"}, {"call_id": "call_irHIidogP87WYJAzq3zUgOuz", "output": "32", "type": "function_call_output", "id": null, "status": null}, {"call_id": "call_ZcoA0E9hjlDitLzFK9xhWI3y", "output": "16", "type": "function_call_output", "id": null, "status": null}, {"arguments": "{\"expr\":\"32 + 16\"}", "call_id": "call_pb6sKZ4tagrrkoAnZydIFqJO", "name": "calculator", "type": "function_call", "id": "call_pb6sKZ4tagrrkoAnZydIFqJO", "status": "completed"}, {"call_id": "call_pb6sKZ4tagrrkoAnZydIFqJO", "output": "48", "type": "function_call_output", "id": null, "status": null}], [{"content": "Albert is wondering how much pizza he 
can eat in one day. He buys 2 large pizzas and 2 small pizzas. A large pizza has 16 slices and a small pizza has 8 slices. If he eats it all, how many pieces does he eat that day?", "role": "user", "type": "message"}, {"arguments": "{\"expr\": \"2 * 16\"}", "call_id": "call_irHIidogP87WYJAzq3zUgOuz", "name": "calculator", "type": "function_call", "id": "call_irHIidogP87WYJAzq3zUgOuz", "status": "completed"}, {"arguments": "{\"expr\": \"2 * 8\"}", "call_id": "call_ZcoA0E9hjlDitLzFK9xhWI3y", "name": "calculator", "type": "function_call", "id": "call_ZcoA0E9hjlDitLzFK9xhWI3y", "status": "completed"}, {"call_id": "call_irHIidogP87WYJAzq3zUgOuz", "output": "32", "type": "function_call_output", "id": null, "status": null}, {"call_id": "call_ZcoA0E9hjlDitLzFK9xhWI3y", "output": "16", "type": "function_call_output", "id": null, "status": null}, {"arguments": "{\"expr\":\"32 + 16\"}", "call_id": "call_pb6sKZ4tagrrkoAnZydIFqJO", "name": "calculator", "type": "function_call", "id": "call_pb6sKZ4tagrrkoAnZydIFqJO", "status": "completed"}, {"call_id": "call_pb6sKZ4tagrrkoAnZydIFqJO", "output": "48", "type": "function_call_output", "id": null, "status": null}, {"id": "msg_d0dc33e2f646426ca6116030d0a65c85", "content": [{"annotations": [], "text": "If Albert eats all the pizza, he eats 48 pieces in one day.", "type": "output_text", "logprobs": null}], "role": "assistant", "status": "completed", "type": "message"}]], "parallel_tool_calls": true, "temperature": null, "tool_choice": "auto", "tools": [{"name": "calculator", "parameters": {"type": "object", "properties": {"expr": {"description": "A valid Python expression.", "title": "Expr", "type": "string"}}, "required": ["expr"], "additionalProperties": false}, "strict": true, "type": "function", "description": "Calculate a mathematical expression."}, {"name": "submit_answer", "parameters": {"type": "object", "properties": {"answer": {"description": "Proposed answer.", "title": "Answer", "type": "string"}}, "required": ["answer"], 
"additionalProperties": false}, "strict": true, "type": "function", "description": "Submit the proposed answer and check if it is correct. This action is terminal."}], "top_p": null, "background": null, "conversation": null, "max_output_tokens": null, "max_tool_calls": null, "previous_response_id": null, "prompt": null, "prompt_cache_key": null, "reasoning": null, "safety_identifier": null, "service_tier": null, "status": null, "text": null, "top_logprobs": null, "truncation": null, "usage": null, "user": null, "env_id": "9eaf24bd-b62d-4abf-b1b8-7c94096fdf7f", "group_id": "4", "contains_transitions": true}, "reward": 0.0, "task_idx": 4, "agent_ref": {"type": "responses_api_agents", "name": "gsm8k_aviary_agent"}} +{"responses_create_params": {"background": null, "include": null, "input": [], "instructions": null, "max_output_tokens": null, "max_tool_calls": null, "metadata": null, "model": null, "parallel_tool_calls": true, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "store": null, "temperature": null, "text": null, "tool_choice": "auto", "tools": [], "top_logprobs": null, "top_p": null, "truncation": null, "user": null, "stream": null}, "response": {"id": "resp_1438d43e7a6d4504b03cce669af8ab13", "created_at": 1765834049.0, "error": null, "incomplete_details": null, "instructions": null, "metadata": null, "model": "gpt-4.1-2025-04-14", "object": "response", "output": [[{"content": "Julie is reading a 120-page book. Yesterday, she was able to read 12 pages and today, she read twice as many pages as yesterday. 
If she wants to read half of the remaining pages tomorrow, how many pages should she read?", "role": "user", "type": "message"}, {"arguments": "{\"expr\":\"12*2\"}", "call_id": "call_Ptj2Zb1CaYgRVhSSJZyS37bK", "name": "calculator", "type": "function_call", "id": "call_Ptj2Zb1CaYgRVhSSJZyS37bK", "status": "completed"}, {"call_id": "call_Ptj2Zb1CaYgRVhSSJZyS37bK", "output": "24", "type": "function_call_output", "id": null, "status": null}], [{"content": "Julie is reading a 120-page book. Yesterday, she was able to read 12 pages and today, she read twice as many pages as yesterday. If she wants to read half of the remaining pages tomorrow, how many pages should she read?", "role": "user", "type": "message"}, {"arguments": "{\"expr\":\"12*2\"}", "call_id": "call_Ptj2Zb1CaYgRVhSSJZyS37bK", "name": "calculator", "type": "function_call", "id": "call_Ptj2Zb1CaYgRVhSSJZyS37bK", "status": "completed"}, {"call_id": "call_Ptj2Zb1CaYgRVhSSJZyS37bK", "output": "24", "type": "function_call_output", "id": null, "status": null}, {"arguments": "{\"expr\":\"120 - 12 - 24\"}", "call_id": "call_Ar70NpaOVSoplw6IQOI9y6fc", "name": "calculator", "type": "function_call", "id": "call_Ar70NpaOVSoplw6IQOI9y6fc", "status": "completed"}, {"call_id": "call_Ar70NpaOVSoplw6IQOI9y6fc", "output": "84", "type": "function_call_output", "id": null, "status": null}], [{"content": "Julie is reading a 120-page book. Yesterday, she was able to read 12 pages and today, she read twice as many pages as yesterday. 
If she wants to read half of the remaining pages tomorrow, how many pages should she read?", "role": "user", "type": "message"}, {"arguments": "{\"expr\":\"12*2\"}", "call_id": "call_Ptj2Zb1CaYgRVhSSJZyS37bK", "name": "calculator", "type": "function_call", "id": "call_Ptj2Zb1CaYgRVhSSJZyS37bK", "status": "completed"}, {"call_id": "call_Ptj2Zb1CaYgRVhSSJZyS37bK", "output": "24", "type": "function_call_output", "id": null, "status": null}, {"arguments": "{\"expr\":\"120 - 12 - 24\"}", "call_id": "call_Ar70NpaOVSoplw6IQOI9y6fc", "name": "calculator", "type": "function_call", "id": "call_Ar70NpaOVSoplw6IQOI9y6fc", "status": "completed"}, {"call_id": "call_Ar70NpaOVSoplw6IQOI9y6fc", "output": "84", "type": "function_call_output", "id": null, "status": null}, {"arguments": "{\"expr\":\"84 / 2\"}", "call_id": "call_IYOClLlY0cYBehW1fcmYkUk0", "name": "calculator", "type": "function_call", "id": "call_IYOClLlY0cYBehW1fcmYkUk0", "status": "completed"}, {"call_id": "call_IYOClLlY0cYBehW1fcmYkUk0", "output": "42", "type": "function_call_output", "id": null, "status": null}], [{"content": "Julie is reading a 120-page book. Yesterday, she was able to read 12 pages and today, she read twice as many pages as yesterday. 
If she wants to read half of the remaining pages tomorrow, how many pages should she read?", "role": "user", "type": "message"}, {"arguments": "{\"expr\":\"12*2\"}", "call_id": "call_Ptj2Zb1CaYgRVhSSJZyS37bK", "name": "calculator", "type": "function_call", "id": "call_Ptj2Zb1CaYgRVhSSJZyS37bK", "status": "completed"}, {"call_id": "call_Ptj2Zb1CaYgRVhSSJZyS37bK", "output": "24", "type": "function_call_output", "id": null, "status": null}, {"arguments": "{\"expr\":\"120 - 12 - 24\"}", "call_id": "call_Ar70NpaOVSoplw6IQOI9y6fc", "name": "calculator", "type": "function_call", "id": "call_Ar70NpaOVSoplw6IQOI9y6fc", "status": "completed"}, {"call_id": "call_Ar70NpaOVSoplw6IQOI9y6fc", "output": "84", "type": "function_call_output", "id": null, "status": null}, {"arguments": "{\"expr\":\"84 / 2\"}", "call_id": "call_IYOClLlY0cYBehW1fcmYkUk0", "name": "calculator", "type": "function_call", "id": "call_IYOClLlY0cYBehW1fcmYkUk0", "status": "completed"}, {"call_id": "call_IYOClLlY0cYBehW1fcmYkUk0", "output": "42", "type": "function_call_output", "id": null, "status": null}, {"id": "msg_868b48183b2e4a729583b633d979565b", "content": [{"annotations": [], "text": "Julie should read 42 pages tomorrow, which is half of the 84 pages she will have remaining after reading 12 pages yesterday and 24 pages today.", "type": "output_text", "logprobs": null}], "role": "assistant", "status": "completed", "type": "message"}]], "parallel_tool_calls": true, "temperature": null, "tool_choice": "auto", "tools": [{"name": "calculator", "parameters": {"type": "object", "properties": {"expr": {"description": "A valid Python expression.", "title": "Expr", "type": "string"}}, "required": ["expr"], "additionalProperties": false}, "strict": true, "type": "function", "description": "Calculate a mathematical expression."}, {"name": "submit_answer", "parameters": {"type": "object", "properties": {"answer": {"description": "Proposed answer.", "title": "Answer", "type": "string"}}, "required": ["answer"], 
"additionalProperties": false}, "strict": true, "type": "function", "description": "Submit the proposed answer and check if it is correct. This action is terminal."}], "top_p": null, "background": null, "conversation": null, "max_output_tokens": null, "max_tool_calls": null, "previous_response_id": null, "prompt": null, "prompt_cache_key": null, "reasoning": null, "safety_identifier": null, "service_tier": null, "status": null, "text": null, "top_logprobs": null, "truncation": null, "usage": null, "user": null, "env_id": "16098ac7-bbc4-4b6a-99c3-51b8e95ce309", "group_id": "2", "contains_transitions": true}, "reward": 0.0, "task_idx": 2, "agent_ref": {"type": "responses_api_agents", "name": "gsm8k_aviary_agent"}} diff --git a/resources_servers/aviary/data/gsm8k_example.jsonl b/resources_servers/aviary/data/gsm8k_example.jsonl new file mode 100644 index 0000000000..573d439dbb --- /dev/null +++ b/resources_servers/aviary/data/gsm8k_example.jsonl @@ -0,0 +1,5 @@ +{"task_idx":0,"responses_create_params":{"input":[]}} +{"task_idx":1,"responses_create_params":{"input":[]}} +{"task_idx":2,"responses_create_params":{"input":[]}} +{"task_idx":3,"responses_create_params":{"input":[]}} +{"task_idx":4,"responses_create_params":{"input":[]}} diff --git a/resources_servers/aviary/data/hotpotqa_example.jsonl b/resources_servers/aviary/data/hotpotqa_example.jsonl new file mode 100644 index 0000000000..573d439dbb --- /dev/null +++ b/resources_servers/aviary/data/hotpotqa_example.jsonl @@ -0,0 +1,5 @@ +{"task_idx":0,"responses_create_params":{"input":[]}} +{"task_idx":1,"responses_create_params":{"input":[]}} +{"task_idx":2,"responses_create_params":{"input":[]}} +{"task_idx":3,"responses_create_params":{"input":[]}} +{"task_idx":4,"responses_create_params":{"input":[]}} diff --git a/resources_servers/aviary/gsm8k_app.py b/resources_servers/aviary/gsm8k_app.py new file mode 100644 index 0000000000..88c4656008 --- /dev/null +++ b/resources_servers/aviary/gsm8k_app.py @@ -0,0 +1,25 @@ 
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from pydantic import Field + +from aviary.envs.gsm8k import CalculatorEnv, GSM8kDataset, GSM8kDatasetSplit +from resources_servers.aviary.app import AviaryResourcesServer + + +class GSM8kResourcesServer(AviaryResourcesServer[CalculatorEnv, GSM8kDataset]): + dataset: GSM8kDataset = Field(default_factory=lambda: GSM8kDataset(GSM8kDatasetSplit.train)) + + +if __name__ == "__main__": + GSM8kResourcesServer.run_webserver() diff --git a/resources_servers/aviary/hotpotqa_app.py b/resources_servers/aviary/hotpotqa_app.py new file mode 100644 index 0000000000..f05ef41430 --- /dev/null +++ b/resources_servers/aviary/hotpotqa_app.py @@ -0,0 +1,25 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from pydantic import Field + +from aviary.envs.hotpotqa import HotPotQADataset, HotPotQAEnv +from resources_servers.aviary.app import AviaryResourcesServer + + +class HotPotQAResourcesServer(AviaryResourcesServer[HotPotQAEnv, HotPotQADataset]): + dataset: HotPotQADataset = Field(default_factory=lambda: HotPotQADataset(split="train")) + + +if __name__ == "__main__": + HotPotQAResourcesServer.run_webserver() diff --git a/resources_servers/aviary/notebook_app.py b/resources_servers/aviary/notebook_app.py new file mode 100644 index 0000000000..b5072527c1 --- /dev/null +++ b/resources_servers/aviary/notebook_app.py @@ -0,0 +1,184 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import logging +import os +import shutil +import zipfile +from ast import literal_eval +from pathlib import Path +from tempfile import mkdtemp +from typing import cast + +import huggingface_hub +from datasets import Dataset, load_dataset +from pydantic import Field +from tqdm import tqdm + +from aviary.core import EvalAnswerMode, Message, Messages, TaskDataset, Tool, eval_answer +from aviary.envs.notebook import NBEnvironment +from aviary.envs.notebook.utils import NBLanguage +from resources_servers.aviary.app import AviaryResourcesServer + + +logger = logging.getLogger(__name__) + +CAPSULE_DATA_LOCATION = os.getenv("BIXBENCH_CAPSULE_DATA_LOCATION", "~/bixbench_capsules") + + +class BixBenchEnv(NBEnvironment): + def __init__(self, *args, question: str, answer: str, **kwargs): + super().__init__(*args, **kwargs) + self.question = question + self.answer = answer + + # TODO: figure out why we can't pass optional args anymore + async def edit_cell(self, contents: str, idx: int | None) -> str: + """Edit the notebook by modifying a specific code cell. + + ONLY CODE CELLS ARE SUPPORTED. Do no attempt to write Markdown or raw text, + though you are permitted (and encouraged) to write comments in the code cells. + The cell will be automatically rerun if a successful edit is made. + + WRITE SHORT TO MEDIUM LENGTH CELLS FOR EASE OF DEBUGGING. + + Args: + contents: Cell contents to insert. We assume the cell is a code block. + idx: Index of the cell to edit. If None, appends a new cell. 
+ """ + return await super().edit_cell(contents, idx) + + async def reset(self) -> tuple[Messages, list[Tool]]: + obs, tools = await super().reset() + tools.append(Tool.from_function(self.submit_solution)) + + obs = [ + Message( + content="Using the available tools and data, write a jupyter notebook " + f"to solve the following problem: {self.question!r}" + ), + *obs, + ] + + return obs, tools + + async def submit_solution(self, solution: str) -> str: + """Submit your solution (in plain text) to the given task. + + The solution should be based on the analysis you have conducted + in the notebook. Note that this tool may only be called once + and ends the episode. + + Args: + solution: Your solution to the problem + """ + # TODO: support various eval modes + self.state.answer = solution + self.state.done = True + + score = await eval_answer( + proposed=solution, + correct=self.answer, + question=self.question, + eval_mode=EvalAnswerMode.LLM, + llm_eval_config={"name": "gpt-5-mini"}, + ) + correct = score == 1.0 + self.state.total_reward += 1.0 if correct else 0.0 + + return "Correct" if correct else "Incorrect" + + +class BixBenchDataset(TaskDataset[BixBenchEnv]): + """ + Implements the BixBench dataset (https://arxiv.org/abs/2503.00096). + + Each question is turned into an environment containing a Jupyter notebook. 
+ """ + + def __init__(self, split: str = "train"): + bixbench_repo_id = "futurehouse/BixBench" + dataset = cast(Dataset, load_dataset(bixbench_repo_id, split=split)) + self.dataset: list[dict] = [] + for row in dataset: + for question in literal_eval(row["questions"]): + self.dataset.append( + { + "uuid": row["uuid"], + "question": question["question"], + "answer": question["ideal_answer"], + } + ) + + self.capsule_path = Path(CAPSULE_DATA_LOCATION).expanduser() + if not self.capsule_path.exists(): + logger.warning(f"Downloading BixBench dataset to {self.capsule_path}, this may take a while...") + self.capsule_path.mkdir(parents=True, exist_ok=True) + huggingface_hub.snapshot_download( + repo_id=bixbench_repo_id, + repo_type="dataset", + local_dir=self.capsule_path, + ) + logger.warning(f"Extracting BixBench dataset to {self.capsule_path}, this may take a while...") + for path in tqdm(list(self.capsule_path.glob("*.zip")), desc="Extracting"): + with zipfile.ZipFile(path, "r") as zip_ref: + zip_ref.extractall(self.capsule_path) + path.unlink() + + maybe_folder_dir = self.capsule_path / path.stem + if maybe_folder_dir.is_dir(): + # move all contents one level up and remove the folder + for item in maybe_folder_dir.glob("Capsule*"): + shutil.copytree(item, self.capsule_path / item.name) + shutil.rmtree(maybe_folder_dir) + + def __len__(self): + return len(self.dataset) + + def get_new_env_by_idx(self, idx: int) -> BixBenchEnv: + row = self.dataset[idx] + uuid = row["uuid"] + + capsule_path = self.capsule_path / f"CapsuleData-{uuid}" + + notebook_name = NBEnvironment.NOTEBOOK_NAME + # Define local capsule directory + problem_dir = Path(mkdtemp()) + problem_dir.mkdir(parents=True, exist_ok=True) + + # Copy capsule contents to local directory + for item in capsule_path.iterdir(): + if item.is_dir(): + shutil.copytree(item, problem_dir / item.name) + else: + shutil.copy(item, problem_dir) + + nb_path = problem_dir / notebook_name + + return BixBenchEnv( + 
question=row["question"], + answer=row["answer"], + work_dir=problem_dir, + nb_path=nb_path, + language=NBLanguage.PYTHON, + use_tmp_work_dir=False, + use_docker=True, + ) + + +class BixBenchResourcesServer(AviaryResourcesServer[BixBenchEnv, BixBenchDataset]): + dataset: BixBenchDataset = Field(default_factory=lambda: BixBenchDataset()) + + +if __name__ == "__main__": + BixBenchResourcesServer.run_webserver() diff --git a/resources_servers/aviary/requirements.txt b/resources_servers/aviary/requirements.txt new file mode 100644 index 0000000000..1e4141350c --- /dev/null +++ b/resources_servers/aviary/requirements.txt @@ -0,0 +1,5 @@ +-e nemo-gym[dev] @ ../../ +fhaviary[gsm8k,hotpotqa,notebook,llm]>=0.28.0 +tqdm +datasets +huggingface-hub diff --git a/resources_servers/aviary/schemas.py b/resources_servers/aviary/schemas.py new file mode 100644 index 0000000000..a192ad7d60 --- /dev/null +++ b/resources_servers/aviary/schemas.py @@ -0,0 +1,93 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Contains a set of schemas used by both the AviaryResourcesServer and the AviaryAgent. 
+""" + +from typing import Literal + +from openai.types.responses import FunctionToolParam +from pydantic import BaseModel, ConfigDict + +from nemo_gym.base_resources_server import ( + BaseResourcesServerConfig, + BaseSeedSessionRequest, + BaseSeedSessionResponse, + BaseVerifyRequest, + BaseVerifyResponse, +) +from nemo_gym.openai_utils import ( + NeMoGymEasyInputMessage, + NeMoGymFunctionCallOutput, + NeMoGymResponse, + NeMoGymResponseFunctionToolCall, + NeMoGymResponseOutputItem, +) + + +class AviaryResourcesServerConfig(BaseResourcesServerConfig): + pass + + +class AviarySeedSessionRequest(BaseSeedSessionRequest): + task_idx: int + + +class AviaryEnvStateEasyInputMessage(NeMoGymEasyInputMessage): + """Special subclass so we can identify messages representing environment state.""" + + is_env_state: Literal[True] = True + + +class AviarySeedSessionResponse(BaseSeedSessionResponse): + env_id: str + obs: list[NeMoGymEasyInputMessage | AviaryEnvStateEasyInputMessage] + tools: list[FunctionToolParam] + + +class AviaryStepRequest(BaseModel): + env_id: str + action: list[NeMoGymResponseFunctionToolCall] + + +class AviaryStepResponse(BaseModel): + obs: list[NeMoGymFunctionCallOutput | NeMoGymEasyInputMessage | AviaryEnvStateEasyInputMessage] + reward: float + done: bool + + +class AviaryNeMoGymResponse(NeMoGymResponse): + env_id: str + group_id: str + contains_transitions: Literal[True] = True + output: list[list[NeMoGymResponseOutputItem]] + + +class AviaryAgentVerifyRequest(BaseVerifyRequest): + model_config = ConfigDict(extra="allow") + response: AviaryNeMoGymResponse + + +# Use this MRO so AviaryAgentVerifyRequest.response supersedes BaseVerifyResponse.response +class AviaryAgentVerifyResponse(AviaryAgentVerifyRequest, BaseVerifyResponse): + model_config = ConfigDict(extra="allow") + + +class AviaryCloseRequest(BaseModel): + env_id: str + + +class AviaryCloseResponse(BaseModel): + message: str + success: bool diff --git a/resources_servers/aviary/tests/test_app.py 
b/resources_servers/aviary/tests/test_app.py new file mode 100644 index 0000000000..d2e184991c --- /dev/null +++ b/resources_servers/aviary/tests/test_app.py @@ -0,0 +1,149 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import json +from unittest.mock import MagicMock, patch + +import httpx +import pytest +from aviary.dataset_server import TaskDatasetServer +from aviary.envs.gsm8k import GSM8kDataset, GSM8kDatasetSplit +from fastapi import Request +from starlette.testclient import TestClient + +from nemo_gym.openai_utils import NeMoGymResponseFunctionToolCall +from nemo_gym.server_utils import ServerClient +from resources_servers.aviary.app import AviaryResourcesServerConfig +from resources_servers.aviary.client_app import AviaryClientResourcesServer, AviaryClientResourcesServerConfig +from resources_servers.aviary.gsm8k_app import GSM8kResourcesServer +from resources_servers.aviary.notebook_app import BixBenchResourcesServer +from resources_servers.aviary.schemas import AviaryCloseRequest, AviarySeedSessionRequest, AviaryStepRequest + + +class TestGSM8kApp: + @pytest.mark.asyncio + async def test_server_lifecycle(self) -> None: + # Create the server + config = AviaryResourcesServerConfig( + name="", + host="0.0.0.0", + port=8080, + entrypoint="", + ) + server = GSM8kResourcesServer(config=config, server_client=MagicMock(spec=ServerClient)) + + # Start an environment + mock_request = 
MagicMock(spec=Request) + seed_resp = await server.seed_session(mock_request, AviarySeedSessionRequest(task_idx=0)) + + assert seed_resp.obs, "Expected non-empty observations" + assert seed_resp.tools, "Expected non-empty tools" + + # Take a step + action = AviaryStepRequest( + env_id=seed_resp.env_id, + action=[ + NeMoGymResponseFunctionToolCall( + call_id="abc123", + name="calculator", + arguments=json.dumps({"expr": "1 + 1"}), + ) + ], + ) + step_resp = await server.step(mock_request, action) + + assert len(step_resp.obs) == 1, "Expected 1 observation" + assert step_resp.obs[0].output == "2" + + +class TestNotebookApp: + @pytest.mark.skip(reason="Skipping notebook app tests - requires data download and Docker") + @pytest.mark.asyncio + async def test_server_lifecycle(self) -> None: + # Create the server + config = AviaryResourcesServerConfig( + name="", + host="0.0.0.0", + port=8080, + entrypoint="", + ) + server = BixBenchResourcesServer(config=config, server_client=MagicMock(spec=ServerClient)) + + # Start an environment + # NOTE: this spins up a docker container, so should probably skip in CI + mock_request = MagicMock(spec=Request) + seed_resp = await server.seed_session(mock_request, AviarySeedSessionRequest(task_idx=0)) + + assert seed_resp.obs, "Expected non-empty observations" + assert seed_resp.tools, "Expected non-empty tools" + + # Take a step + action = AviaryStepRequest( + env_id=seed_resp.env_id, + action=[ + NeMoGymResponseFunctionToolCall( + call_id="abc123", + name="submit_solution", + arguments=json.dumps({"solution": "print('Hello, world!')"}), + ) + ], + ) + step_resp = await server.step(mock_request, action) + + assert len(step_resp.obs) == 2, "Expected 2 observations" + assert step_resp.done, "Expected done" + + # Close the environment + close_resp = await server.close(mock_request, AviaryCloseRequest(env_id=seed_resp.env_id)) + assert close_resp.success, "Expected success" + assert not server.env_id_to_env, "Expected no environments" + + 
+class TestClientApp: + @pytest.mark.asyncio + async def test_client_server_lifecycle(self) -> None: + dataset = GSM8kDataset(GSM8kDatasetSplit.train) + task_server = TaskDatasetServer(dataset=dataset, port=8042) + + test_client = TestClient(task_server.app) + + def create_async_client(*args, **kwargs): + return httpx.AsyncClient(transport=httpx.ASGITransport(app=task_server.app), base_url="http://testserver") + + with ( + patch("httpx.Client") as mock_client_class, + patch("httpx_aiohttp.HttpxAiohttpClient", side_effect=create_async_client), + ): + mock_client_class.return_value = test_client + + config = AviaryClientResourcesServerConfig( + name="", + host="0.0.0.0", + port=8081, + entrypoint="", + server_url="http://testserver", + ) + client_server = AviaryClientResourcesServer( + config=config, + server_client=MagicMock(spec=ServerClient), + ) + + mock_request = MagicMock(spec=Request) + seed_resp = await client_server.seed_session(mock_request, AviarySeedSessionRequest(task_idx=0)) + + assert seed_resp.obs, "Expected non-empty observations" + assert seed_resp.tools, "Expected non-empty tools" + assert seed_resp.env_id, "Expected environment ID" + + assert len(task_server.envs) == 1, "Expected 1 environment in TaskDatasetServer" + assert len(client_server.env_id_to_env) == 1, "Expected 1 environment in AviaryClientResourcesServer" diff --git a/responses_api_agents/aviary_agent/README.md b/responses_api_agents/aviary_agent/README.md new file mode 100644 index 0000000000..6a2b17d0a6 --- /dev/null +++ b/responses_api_agents/aviary_agent/README.md @@ -0,0 +1,14 @@ +# Description + +[Aviary](https://github.com/Future-House/aviary) is a framework built by [FutureHouse](https://www.futurehouse.org/) for developing tools and environments for LLM agents. This integration allows NeMo Gym rollouts over Aviary environments and datasets. 
+ +This agent is intended to be used with the resource servers implemented in `resource_servers/aviary`, which provide the environment and tool interfaces to underlying Aviary implementations. + + +# Licensing information +Code: Apache 2.0 +Data: N/A + +Dependencies +- nemo_gym: Apache 2.0 +- aviary: Apache 2.0 diff --git a/responses_api_agents/aviary_agent/app.py b/responses_api_agents/aviary_agent/app.py new file mode 100644 index 0000000000..f515de08eb --- /dev/null +++ b/responses_api_agents/aviary_agent/app.py @@ -0,0 +1,258 @@ +# Copyright (c) 2025, NVIDIA CORPORATION, PLACEHOLDER. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import json
import logging
from collections.abc import Sequence
from typing import List, cast

import aiohttp
from pydantic import ConfigDict, Field, ValidationError
from tenacity import retry, stop_after_attempt, wait_exponential_jitter

from nemo_gym.base_resources_server import BaseRunRequest
from nemo_gym.base_responses_api_agent import BaseResponsesAPIAgentConfig, SimpleResponsesAPIAgent
from nemo_gym.config_types import ModelServerRef, ResourcesServerRef
from nemo_gym.openai_utils import (
    NeMoGymEasyInputMessage,
    NeMoGymFunctionCallOutput,
    NeMoGymResponse,
    NeMoGymResponseCreateParamsNonStreaming,
    NeMoGymResponseFunctionToolCall,
    NeMoGymResponseInput,
    NeMoGymResponseOutputMessage,
)
from resources_servers.aviary.schemas import (
    AviaryAgentVerifyRequest,
    AviaryAgentVerifyResponse,
    AviaryEnvStateEasyInputMessage,
    AviaryNeMoGymResponse,
    AviarySeedSessionResponse,
    AviaryStepResponse,
)


logger = logging.getLogger(__name__)


# Settings for AviaryAgent: which resources/model servers to talk to and how a
# rollout is bounded and compacted.
class AviaryAgentConfig(BaseResponsesAPIAgentConfig):
    resources_server: ResourcesServerRef
    model_server: ModelServerRef

    max_steps: int | None = Field(
        default=None,
        description="The maximum number of steps to take in the environment. "
        "If not set, the agent will run indefinitely.",
    )

    # Doesn't cause an issue if not set, but if it is, then
    # we can avoid sending requests that are guaranteed to
    # exceed the limit. If not set, vLLM will reject the request
    # for us (but also clutter logs with exceptions).
    # TODO: see if we can retrieve this from /models endpoint
    # NOTE(review): nothing in this module reads this field yet - confirm it is
    # consumed elsewhere or still to be wired in.
    max_total_sequence_length: int | None = Field(
        default=None,
        description="If set, the rollout will stop when the agent state exceeds this length. "
        "If not set, will rely on a vLLM exception to tell us when we've exceeded the model's "
        "token limit. Setting this simply avoids that exception.",
    )

    done_if_no_tool_calls: bool = Field(
        default=True, description="If True, end the rollout if the model does not call any tools."
    )

    collapse_old_env_states: bool = Field(
        default=False,
        description="If True, collapse previous Aviary EnvStateMessages into a hidden message "
        "in the agent state. Can be used to compress the context, if supported by the environment.",
    )
    old_env_state_message: str = Field(
        default="[Previous environment state - hidden]",
        description="The message to use when collapsing previous Aviary EnvStateMessages into a hidden message.",
    )


# Request body for a rollout: the index of the task to seed plus optional
# Responses API parameters (defaults to an empty input list). Extra fields are allowed.
class AviaryAgentRunRequest(BaseRunRequest):
    model_config = ConfigDict(extra="allow")

    task_idx: int
    responses_create_params: NeMoGymResponseCreateParamsNonStreaming = Field(
        default_factory=lambda: NeMoGymResponseCreateParamsNonStreaming(input=[])
    )


# Agent that seeds a session on the resources server, then alternates between
# sampling from the model server and stepping the environment until done.
class AviaryAgent(SimpleResponsesAPIAgent):
    config: AviaryAgentConfig

    def update_agent_state(
        self,
        agent_state: NeMoGymResponseCreateParamsNonStreaming,
        model_output: list[NeMoGymResponseOutputMessage],
        obs: list[NeMoGymEasyInputMessage | NeMoGymFunctionCallOutput],
        successful_transition: bool,
    ) -> NeMoGymResponseCreateParamsNonStreaming:
        """Update the agent state.

        Separate method so subclasses can override.

        Args:
            agent_state: Current request parameters; ``input`` holds the conversation so far.
            model_output: Output items of the latest model response.
            obs: Messages to append after the model output.
            successful_transition: Whether ``obs`` came from an environment step
                (as opposed to a locally generated "try again" message).

        Returns:
            A copy of ``agent_state`` whose ``input`` is the previous messages
            followed by ``model_output`` and ``obs``.
        """

        prev_messages = agent_state.input
        if successful_transition and self.config.collapse_old_env_states:
            # only collapse if we had a successful transition - otherwise we'd be hiding previous
            # env state without supplying a new one in obs.
            hidden_message = NeMoGymEasyInputMessage(role="user", content=self.config.old_env_state_message)
            prev_messages = [
                hidden_message if isinstance(m, AviaryEnvStateEasyInputMessage) else m for m in prev_messages
            ]

        return agent_state.model_copy(update={"input": prev_messages + model_output + obs})

    @retry(stop=stop_after_attempt(3), wait=wait_exponential_jitter(initial=5))
    async def _seed_session(self, task_idx: int) -> AviarySeedSessionResponse:
        """POST ``/seed_session`` for ``task_idx`` and validate the reply.

        Retried up to three attempts (see the decorator) on any exception.

        Raises:
            ValueError: If the seed response contains no observations.
        """
        reset_response = await self.server_client.post(
            server_name=self.config.resources_server.name,
            url_path="/seed_session",
            json={"task_idx": task_idx},
        )
        reset_response.raise_for_status()
        seed_session_response = AviarySeedSessionResponse.model_validate(await reset_response.json())
        if not seed_session_response.obs:
            raise ValueError("No observations in seed session response")
        return seed_session_response

    async def responses(self, req: AviaryAgentRunRequest) -> AviaryNeMoGymResponse:
        """Run one rollout for ``req.task_idx`` and return the last model response.

        The returned object is the final model response with ``output`` replaced
        by the list of agent inputs recorded after every step, plus ``env_id``
        and ``group_id`` (the task index as a string). ``/close`` is always
        posted for the seeded environment, even if the loop raises.

        Raises:
            AssertionError: If no model response was ever validated (e.g. the
                first model call failed or ``max_steps`` is 0).
        """
        req = req.model_copy(deep=True)
        body = req.responses_create_params

        if isinstance(body.input, str):
            body.input = [NeMoGymEasyInputMessage(role="user", content=body.input)]

        seed_session_response = await self._seed_session(req.task_idx)

        agent_state = body.model_copy(
            update={"input": body.input + seed_session_response.obs, "tools": seed_session_response.tools}
        )

        env_id = seed_session_response.env_id
        model_response: NeMoGymResponse | None = None
        agent_state_history: list[NeMoGymResponseInput] = []
        model_server_cookies = None

        step = 0
        try:
            while True:
                if self.config.max_steps is not None and step >= self.config.max_steps:
                    break
                step += 1
                successful_transition = True

                # Sample action from model
                # Pre-bind so the error handler can never hit an unbound name,
                # even if post() itself raises before assignment.
                raw_model_response = None
                try:
                    raw_model_response = await self.server_client.post(
                        server_name=self.config.model_server.name,
                        url_path="/v1/responses",
                        json=agent_state,
                        cookies=model_server_cookies,
                    )
                    raw_model_response.raise_for_status()
                    model_server_cookies = raw_model_response.cookies
                    model_response_json = await raw_model_response.json()
                except (json.JSONDecodeError, aiohttp.ClientResponseError) as e:
                    # JSONDecodeError will be thrown if there's an underlying openai error.
                    # For now, we break. Default reward of 0 will be returned when /verify is called.
                    # ``text`` on the response is a coroutine method, so formatting it
                    # would only print a bound-method repr; log the HTTP status instead.
                    status = getattr(raw_model_response, "status", None)
                    logger.warning(f"Error calling /v1/responses: {e!r}. Response status: {status!r}.")
                    break

                try:
                    model_response = NeMoGymResponse.model_validate(model_response_json)
                except ValidationError as e:
                    logger.warning(f"Error validating model response: {e!r}. Response: {model_response_json!r}.")
                    break

                # Parse model response
                model_output = model_response.output
                all_fn_calls: List[NeMoGymResponseFunctionToolCall] = [
                    o for o in model_output if o.type == "function_call"
                ]
                all_output_messages: List[NeMoGymResponseOutputMessage] = [
                    o for o in model_output if o.type == "message" and o.role == "assistant"
                ]
                done = False
                obs: Sequence[NeMoGymEasyInputMessage | NeMoGymFunctionCallOutput]

                if not all_fn_calls and all_output_messages:
                    if self.config.done_if_no_tool_calls:
                        done = True
                        obs = []
                    else:
                        # Got non-tool-call outputs, so ask the model to try again.
                        obs = [
                            NeMoGymEasyInputMessage(
                                role="user",
                                content="You did not respond with a valid tool call. "
                                "This may mean you did not call tools, or you tried to "
                                "and got the formatting, tool name, or arguments "
                                "wrong. To proceed, please call at least one tool.",
                            )
                        ]
                        successful_transition = False
                else:
                    # Apply action to environment
                    raw_env_response = await self.server_client.post(
                        server_name=self.config.resources_server.name,
                        url_path="/step",
                        json={"action": [c.model_dump(mode="json") for c in all_fn_calls], "env_id": env_id},
                    )
                    env_response = AviaryStepResponse.model_validate(await raw_env_response.json())
                    obs = env_response.obs
                    done = env_response.done

                agent_state = self.update_agent_state(agent_state, model_output, obs, successful_transition)
                agent_state_history.append(cast(NeMoGymResponseInput, agent_state.input))

                if done:
                    break

        finally:
            # Always release the environment on the resources server.
            await self.server_client.post(
                server_name=self.config.resources_server.name, url_path="/close", json={"env_id": env_id}
            )

        assert model_response is not None, (
            "Rollout crashed or terminated before first transition completed, cannot proceed."
        )

        output = AviaryNeMoGymResponse.model_validate(
            model_response.model_dump()
            | {"output": agent_state_history, "env_id": env_id, "group_id": str(req.task_idx)}
        )
        return output

    async def run(self, body: AviaryAgentRunRequest) -> AviaryAgentVerifyResponse:
        """Run a rollout, then POST it to ``/verify`` and return the validated verdict.

        Any exception is logged with its traceback and re-raised unchanged.
        """
        try:
            response = await self.responses(body)

            verify_request = AviaryAgentVerifyRequest.model_validate(body.model_dump() | {"response": response})
            verify_response = await self.server_client.post(
                server_name=self.config.resources_server.name,
                url_path="/verify",
                json=verify_request.model_dump(),
            )

            return AviaryAgentVerifyResponse.model_validate(await verify_response.json())
        except Exception:
            logger.exception("Error in run")
            raise


if __name__ == "__main__":
    AviaryAgent.run_webserver()
# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import uuid
from unittest.mock import AsyncMock, MagicMock, call

from fastapi.testclient import TestClient

from nemo_gym.openai_utils import (
    NeMoGymEasyInputMessage,
    NeMoGymResponseCreateParamsNonStreaming,
    NeMoGymResponseFunctionToolCall,
)
from nemo_gym.server_utils import ServerClient
from responses_api_agents.aviary_agent.app import (
    AviaryAgent,
    AviaryAgentConfig,
    AviaryAgentRunRequest,
    ModelServerRef,
    ResourcesServerRef,
)


def _make_agent(**config_overrides: object) -> AviaryAgent:
    """Build an agent with the shared test server names and a mocked ``ServerClient``."""
    config = AviaryAgentConfig(
        host="0.0.0.0",
        port=8080,
        entrypoint="",
        name="",
        model_server=ModelServerRef(
            type="responses_api_models",
            name="my model name",
        ),
        resources_server=ResourcesServerRef(
            type="resources_servers",
            name="my resources name",
        ),
        **config_overrides,
    )
    return AviaryAgent(config=config, server_client=MagicMock(spec=ServerClient))


def _install_post_mock(agent: AviaryAgent, json_payloads: list, cookies: object = None) -> None:
    """Make every ``server_client.post`` return one response mock whose ``json()`` yields ``json_payloads`` in order."""
    response_mock = AsyncMock()
    response_mock.json.side_effect = json_payloads
    response_mock.raise_for_status = MagicMock()
    response_mock.cookies = cookies
    agent.server_client.post = AsyncMock(return_value=response_mock)


def _tool_call_response(response_id: str, created_at: float, call_id: str, tool_name: str, arguments: dict) -> dict:
    """Return a model-server payload whose only output item is a single function call."""
    return {
        "id": response_id,
        "created_at": created_at,
        "model": "dummy_model",
        "object": "response",
        "output": [
            NeMoGymResponseFunctionToolCall(
                call_id=call_id, name=tool_name, arguments=json.dumps(arguments)
            ).model_dump()
        ],
        "parallel_tool_calls": True,
        "tool_choice": "auto",
        "tools": [],
    }


class TestApp:
    """Tests for ``AviaryAgent`` with all server traffic mocked out."""

    def test_lifecycle(self) -> None:
        """One step through the HTTP endpoint: seed -> model -> step -> close."""
        server = _make_agent(max_steps=1)
        app = server.setup_webserver()
        client = TestClient(app)

        mock_seed_session_data = {
            "env_id": str(uuid.uuid4()),
            "obs": [{"role": "user", "content": "world"}],
            "tools": [],
        }
        mock_response_data = _tool_call_response(
            "resp_688babb004988199b26c5250ba69c1e80abdf302bcd600d3",
            1753983920.0,
            "abc123",
            "get_weather",
            {"city": "San Francisco"},
        )
        mock_step_data = {
            "obs": [{"role": "user", "content": "hello"}],
            "reward": 0.0,
            "done": False,
        }
        _install_post_mock(server, [mock_seed_session_data, mock_response_data, mock_step_data])

        # No model provided should use the one from the config
        res_no_model = client.post(
            "/v1/responses",
            json={
                "task_idx": 0,
                "responses_create_params": {"input": [{"role": "user", "content": "hello"}]},
            },
        )
        assert res_no_model.status_code == 200

        calls = server.server_client.post.await_args_list
        assert len(calls) == 4

        assert calls[0] == call(server_name="my resources name", url_path="/seed_session", json={"task_idx": 0})

        assert calls[1][1]["server_name"] == "my model name"
        assert calls[1][1]["url_path"] == "/v1/responses"
        model_input = calls[1][1]["json"].input
        assert len(model_input) == 2
        assert model_input[0].content == "hello"
        assert model_input[1].content == "world"

        assert calls[2][1]["server_name"] == "my resources name"
        assert calls[2][1]["url_path"] == "/step"
        assert calls[2][1]["json"]["action"][0]["call_id"] == "abc123"
        assert calls[2][1]["json"]["env_id"] == mock_seed_session_data["env_id"]

        assert calls[3] == call(
            server_name="my resources name", url_path="/close", json={"env_id": mock_seed_session_data["env_id"]}
        )

    def test_sanity(self) -> None:
        """The agent can be constructed from a minimal config with empty server names."""
        config = AviaryAgentConfig(
            host="0.0.0.0",
            port=8080,
            entrypoint="",
            name="",
            resources_server=ResourcesServerRef(
                type="resources_servers",
                name="",
            ),
            model_server=ModelServerRef(
                type="responses_api_models",
                name="",
            ),
        )
        AviaryAgent(config=config, server_client=MagicMock(spec=ServerClient))

    async def test_responses_done_if_no_tool_calls_true(self) -> None:
        """A plain assistant message ends the rollout without calling ``/step``."""
        agent = _make_agent(done_if_no_tool_calls=True)

        mock_seed_session_data = {
            "env_id": str(uuid.uuid4()),
            "obs": [{"role": "user", "content": "Initial observation"}],
            "tools": [],
        }
        mock_response_data = {
            "id": "resp_123",
            "created_at": 1753983920.0,
            "model": "dummy_model",
            "object": "response",
            "output": [
                {
                    "id": "msg_123",
                    "content": [{"annotations": [], "text": "I don't need to call any tools", "type": "output_text"}],
                    "role": "assistant",
                    "status": "completed",
                    "type": "message",
                }
            ],
            "parallel_tool_calls": True,
            "tool_choice": "auto",
            "tools": [],
        }
        mock_close_data = {"message": "Success", "success": True}
        _install_post_mock(agent, [mock_seed_session_data, mock_response_data, mock_close_data])

        request = AviaryAgentRunRequest(
            task_idx=0, responses_create_params=NeMoGymResponseCreateParamsNonStreaming(input=[])
        )
        response = await agent.responses(request)

        assert response.env_id == mock_seed_session_data["env_id"]
        assert response.group_id == "0"

        agent.server_client.post.assert_has_awaits(
            [
                call(server_name="my resources name", url_path="/seed_session", json={"task_idx": 0}),
                call(
                    server_name="my model name",
                    url_path="/v1/responses",
                    json=NeMoGymResponseCreateParamsNonStreaming.model_validate(
                        {"input": [{"role": "user", "content": "Initial observation"}], "tools": []}
                    ),
                    cookies=None,
                ),
                call(
                    server_name="my resources name",
                    url_path="/close",
                    json={"env_id": mock_seed_session_data["env_id"]},
                ),
            ]
        )

    async def test_responses_multi_step(self) -> None:
        """Two tool-call steps are executed in order before the environment reports done."""
        agent = _make_agent()

        env_id = str(uuid.uuid4())
        mock_seed_session_data = {"env_id": env_id, "obs": [{"role": "user", "content": "Step 0"}], "tools": []}

        mock_response_1 = _tool_call_response("resp_1", 1753983920.0, "call_1", "tool_1", {"arg": "val1"})
        mock_step_1 = {
            "obs": [{"type": "function_call_output", "call_id": "call_1", "output": "Result 1"}],
            "reward": 0.0,
            "done": False,
        }

        mock_response_2 = _tool_call_response("resp_2", 1753983921.0, "call_2", "tool_2", {"arg": "val2"})
        mock_step_2 = {
            "obs": [{"type": "function_call_output", "call_id": "call_2", "output": "Result 2"}],
            "reward": 1.0,
            "done": True,
        }

        mock_close_data = {"message": "Success", "success": True}

        _install_post_mock(
            agent,
            [
                mock_seed_session_data,
                mock_response_1,
                mock_step_1,
                mock_response_2,
                mock_step_2,
                mock_close_data,
            ],
            cookies=MagicMock(),
        )

        request = AviaryAgentRunRequest(
            task_idx=42, responses_create_params=NeMoGymResponseCreateParamsNonStreaming(input=[])
        )
        response = await agent.responses(request)

        assert response.env_id == env_id
        assert response.group_id == "42"
        assert len(response.output) == 2

        calls = agent.server_client.post.await_args_list
        assert len(calls) == 6
        assert calls[0] == call(server_name="my resources name", url_path="/seed_session", json={"task_idx": 42})
        assert calls[1][1]["server_name"] == "my model name"
        assert calls[1][1]["url_path"] == "/v1/responses"
        assert calls[2][1]["server_name"] == "my resources name"
        assert calls[2][1]["url_path"] == "/step"
        assert calls[2][1]["json"]["action"][0]["call_id"] == "call_1"
        assert calls[3][1]["server_name"] == "my model name"
        assert calls[3][1]["url_path"] == "/v1/responses"
        assert calls[4][1]["server_name"] == "my resources name"
        assert calls[4][1]["url_path"] == "/step"
        assert calls[4][1]["json"]["action"][0]["call_id"] == "call_2"
        assert calls[5] == call(server_name="my resources name", url_path="/close", json={"env_id": env_id})

    async def test_responses_collapse_old_env_states(self) -> None:
        """With collapsing enabled, the second model call sees a placeholder instead of the old env state."""
        agent = _make_agent(collapse_old_env_states=True, old_env_state_message="[Hidden env state]")

        env_id = str(uuid.uuid4())
        mock_seed_session_data = {
            "env_id": env_id,
            "obs": [{"role": "user", "content": "Initial env state", "is_env_state": True}],
            "tools": [],
        }

        mock_response_1 = _tool_call_response("resp_1", 1753983920.0, "call_1", "tool_1", {"arg": "val"})
        mock_step_1 = {
            "obs": [{"role": "user", "content": "New env state after step 1", "is_env_state": True}],
            "reward": 0.0,
            "done": False,
        }

        mock_response_2 = _tool_call_response("resp_2", 1753983921.0, "call_2", "tool_2", {"arg": "val2"})
        mock_step_2 = {"obs": [{"role": "user", "content": "Final observation"}], "reward": 1.0, "done": True}

        mock_close_data = {"message": "Success", "success": True}

        _install_post_mock(
            agent,
            [
                mock_seed_session_data,
                mock_response_1,
                mock_step_1,
                mock_response_2,
                mock_step_2,
                mock_close_data,
            ],
        )

        request = AviaryAgentRunRequest(
            task_idx=0, responses_create_params=NeMoGymResponseCreateParamsNonStreaming(input=[])
        )
        await agent.responses(request)

        calls = agent.server_client.post.await_args_list
        second_model_call = calls[3]
        second_model_input = second_model_call[1]["json"].input

        has_hidden_message = any(
            isinstance(msg, NeMoGymEasyInputMessage) and msg.content == "[Hidden env state]"
            for msg in second_model_input
        )
        assert has_hidden_message, "Old env state should be collapsed to hidden message"

        # The recorded input holds message objects rather than dicts, so a
        # dict-only check can never fail; compare ``content`` on both shapes.
        old_env_state_present = any(
            (msg.get("content") if isinstance(msg, dict) else getattr(msg, "content", None)) == "Initial env state"
            for msg in second_model_input
        )
        assert not old_env_state_present, "Old env state content should not be present"

    async def test_run_workflow(self) -> None:
        """``run`` performs a rollout, closes the environment and returns the ``/verify`` verdict."""
        agent = _make_agent()

        env_id = str(uuid.uuid4())
        mock_seed_session_data = {"env_id": env_id, "obs": [{"role": "user", "content": "obs"}], "tools": []}

        mock_response_data = _tool_call_response("resp_1", 1753983920.0, "call_1", "tool_1", {"arg": "val"})
        mock_step_data = {
            "obs": [{"type": "function_call_output", "call_id": "call_1", "output": "result"}],
            "reward": 1.0,
            "done": True,
        }
        mock_verify_data = {
            "reward": 1.0,
            "success": True,
            "task_idx": 0,
            "responses_create_params": {"input": []},
            "response": {
                "id": "resp_1",
                "created_at": 1753983920.0,
                "model": "dummy_model",
                "object": "response",
                "env_id": env_id,
                "group_id": "0",
                "contains_transitions": True,
                "output": [[{"role": "user", "content": "obs"}]],
                "parallel_tool_calls": True,
                "tool_choice": "auto",
                "tools": [],
            },
        }

        # ``/close`` never reads a JSON body, so only four payloads are consumed.
        _install_post_mock(
            agent,
            [
                mock_seed_session_data,
                mock_response_data,
                mock_step_data,
                mock_verify_data,
            ],
        )

        request = AviaryAgentRunRequest(
            task_idx=0, responses_create_params=NeMoGymResponseCreateParamsNonStreaming(input=[])
        )
        verify_response = await agent.run(request)

        assert verify_response.reward == 1.0
        assert verify_response.success is True

        agent.server_client.post.assert_has_awaits(
            [
                call(server_name="my resources name", url_path="/seed_session", json={"task_idx": 0}),
                call(
                    server_name="my model name",
                    url_path="/v1/responses",
                    json=NeMoGymResponseCreateParamsNonStreaming.model_validate(
                        {"input": [{"role": "user", "content": "obs"}], "tools": []}
                    ),
                    cookies=None,
                ),
                call(
                    server_name="my resources name",
                    url_path="/step",
                    json={
                        "action": [
                            {
                                "call_id": "call_1",
                                "name": "tool_1",
                                "arguments": '{"arg": "val"}',
                                "type": "function_call",
                                "id": None,
                                "status": None,
                            }
                        ],
                        "env_id": env_id,
                    },
                ),
                call(server_name="my resources name", url_path="/close", json={"env_id": env_id}),
            ]
        )