diff --git a/responses_api_models/azure_openai_model/README.md b/responses_api_models/azure_openai_model/README.md
new file mode 100644
index 000000000..7990ee627
--- /dev/null
+++ b/responses_api_models/azure_openai_model/README.md
# Azure OpenAI Model

Use this model server to access Azure OpenAI endpoints for LLM-as-a-judge.

## Configuration

Set up your `env.yaml` file:
```yaml
policy_base_url: https://my.end.point.com/v1/azure
policy_api_key:
policy_model_name: gpt-5-nano
policy_api_version: 2024-10-21
```

## Usage

### Running the server

Pass the Azure `api-version` explicitly on the command line (it must match the
`policy_api_version` value from your `env.yaml`, e.g. `2024-10-21`):
```bash
config_paths="responses_api_models/azure_openai_model/configs/azure_openai_model.yaml, \
resources_servers/equivalence_llm_judge/configs/equivalence_llm_judge.yaml"

ng_run "+config_paths=[${config_paths}]" \
++policy_model.responses_api_models.azure_openai_model.default_query.api-version=2024-10-21
```

### Collecting Rollouts

```bash
ng_collect_rollouts \
    +agent_name=equivalence_llm_judge_simple_agent \
    +input_jsonl_fpath=resources_servers/equivalence_llm_judge/data/example.jsonl \
    +output_jsonl_fpath=results/example_rollouts.jsonl \
    +limit=5
```

### Test cases

```bash
ng_test +entrypoint=responses_api_models/azure_openai_model
```

## Licensing information

- **Code**: Apache 2.0
- **Data**: N/A

## Dependencies

- `nemo_gym`: Apache 2.0
diff --git a/responses_api_models/azure_openai_model/app.py b/responses_api_models/azure_openai_model/app.py
new file mode 100644
index 000000000..4f63ac1fa
--- /dev/null
+++ b/responses_api_models/azure_openai_model/app.py
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from asyncio import Semaphore +from time import time +from uuid import uuid4 + +from fastapi import Request +from openai import AsyncAzureOpenAI + +from nemo_gym.base_responses_api_model import ( + BaseResponsesAPIModelConfig, + Body, + SimpleResponsesAPIModel, +) +from nemo_gym.openai_utils import ( + NeMoGymChatCompletion, + NeMoGymChatCompletionCreateParamsNonStreaming, + NeMoGymResponse, + NeMoGymResponseCreateParamsNonStreaming, +) +from responses_api_models.vllm_model.app import VLLMConverter + + +class AzureOpenAIModelServerConfig(BaseResponsesAPIModelConfig): + openai_base_url: str + openai_api_key: str + openai_model: str + default_query: dict + num_concurrent_requests: int + + +class AzureOpenAIModelServer(SimpleResponsesAPIModel): + config: AzureOpenAIModelServerConfig + + def model_post_init(self, context): + self._client = AsyncAzureOpenAI( + azure_endpoint=self.config.openai_base_url, + api_key=self.config.openai_api_key, + api_version=self.config.default_query.get("api-version"), + ) + self._converter = VLLMConverter(return_token_id_information=False) + self._semaphore: Semaphore = Semaphore(self.config.num_concurrent_requests) + return super().model_post_init(context) + + async def responses( + self, request: Request, body: NeMoGymResponseCreateParamsNonStreaming = Body() + ) -> NeMoGymResponse: + async with self._semaphore: + chat_completion_create_params = self._converter.responses_to_chat_completion_create_params(body) + chat_completion_params_dict = chat_completion_create_params.model_dump(exclude_unset=True) + 
chat_completion_params_dict.setdefault("model", self.config.openai_model) + chat_completion_response = await self._client.chat.completions.create(**chat_completion_params_dict) + + choice = chat_completion_response.choices[0] + response_output = self._converter.postprocess_chat_response(choice) + response_output_dicts = [item.model_dump() for item in response_output] + return NeMoGymResponse( + id=f"resp_{uuid4().hex}", + created_at=int(time()), + model=self.config.openai_model, + object="response", + output=response_output_dicts, + tool_choice=body.tool_choice if "tool_choice" in body else "auto", + parallel_tool_calls=body.parallel_tool_calls, + tools=body.tools, + temperature=body.temperature, + top_p=body.top_p, + background=body.background, + max_output_tokens=body.max_output_tokens, + max_tool_calls=body.max_tool_calls, + previous_response_id=body.previous_response_id, + prompt=body.prompt, + reasoning=body.reasoning, + service_tier=body.service_tier, + text=body.text, + top_logprobs=body.top_logprobs, + truncation=body.truncation, + metadata=body.metadata, + instructions=body.instructions, + user=body.user, + ) + + async def chat_completions( + self, request: Request, body: NeMoGymChatCompletionCreateParamsNonStreaming = Body() + ) -> NeMoGymChatCompletion: + body_dict = body.model_dump(exclude_unset=True) + body_dict.setdefault("model", self.config.openai_model) + openai_response_dict = await self._client.chat.completions.create(**body_dict) + return NeMoGymChatCompletion.model_validate(openai_response_dict) + + +if __name__ == "__main__": + AzureOpenAIModelServer.run_webserver() diff --git a/responses_api_models/azure_openai_model/client.py b/responses_api_models/azure_openai_model/client.py new file mode 100644 index 000000000..dd107ee20 --- /dev/null +++ b/responses_api_models/azure_openai_model/client.py @@ -0,0 +1,40 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from asyncio import run + +from nemo_gym.server_utils import ServerClient + + +server_client = ServerClient.load_from_global_config() + + +async def main(): + task_1 = await server_client.post( + server_name="policy_model", + url_path="/v1/responses", + json={"input": "hello"}, + ) + task_2 = await server_client.post( + server_name="policy_model", + url_path="/v1/chat/completions", + json={ + "messages": [{"role": "user", "content": "hello"}], + }, + ) + print(await task_1.json()) + print(await task_2.json()) + + +if __name__ == "__main__": + run(main()) diff --git a/responses_api_models/azure_openai_model/configs/azure_openai_model.yaml b/responses_api_models/azure_openai_model/configs/azure_openai_model.yaml new file mode 100644 index 000000000..cb3e6878a --- /dev/null +++ b/responses_api_models/azure_openai_model/configs/azure_openai_model.yaml @@ -0,0 +1,11 @@ +policy_model: + responses_api_models: + azure_openai_model: + entrypoint: app.py + openai_base_url: ${policy_base_url} + openai_api_key: ${policy_api_key} + openai_model: ${policy_model_name} + # for azure openai only + default_query: + api-version: ??? 
+ num_concurrent_requests: 8 diff --git a/responses_api_models/azure_openai_model/requirements.txt b/responses_api_models/azure_openai_model/requirements.txt new file mode 100644 index 000000000..00ed83213 --- /dev/null +++ b/responses_api_models/azure_openai_model/requirements.txt @@ -0,0 +1 @@ +-e nemo-gym[dev] @ ../../ diff --git a/responses_api_models/azure_openai_model/tests/test_app.py b/responses_api_models/azure_openai_model/tests/test_app.py new file mode 100644 index 000000000..6ef4bb002 --- /dev/null +++ b/responses_api_models/azure_openai_model/tests/test_app.py @@ -0,0 +1,178 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from unittest.mock import AsyncMock, MagicMock

from fastapi.testclient import TestClient
from openai import AsyncAzureOpenAI
from pytest import MonkeyPatch

from nemo_gym.openai_utils import (
    NeMoGymChatCompletion,
    NeMoGymChatCompletionMessage,
    NeMoGymChoice,
    NeMoGymResponse,
    NeMoGymResponseCreateParamsNonStreaming,
    NeMoGymResponseOutputMessage,
)
from nemo_gym.server_utils import ServerClient
from responses_api_models.azure_openai_model.app import (
    AzureOpenAIModelServer,
    AzureOpenAIModelServerConfig,
)


# Used for mocking created_at timestamp generation
FIXED_TIME = 1691418000
FIXED_UUID = "123"


class FakeUUID:
    """Used for mocking UUIDs"""

    hex = FIXED_UUID


class TestApp:
    def _setup_server(self) -> AzureOpenAIModelServer:
        """Build a server instance with dummy config and a mocked ServerClient.

        Fix: dropped the unused ``monkeypatch`` parameter the original
        signature declared but never read.
        """
        config = AzureOpenAIModelServerConfig(
            host="0.0.0.0",
            port=8081,
            openai_base_url="https://prod.api.nvidia.com/llm/v1/azure",
            openai_api_key="dummy_key",  # pragma: allowlist secret
            openai_model="dummy_model",
            default_query={"api-version": "dummy_version"},
            num_concurrent_requests=8,
            entrypoint="",
            name="",
        )
        return AzureOpenAIModelServer(config=config, server_client=MagicMock(spec=ServerClient))

    async def test_sanity(self) -> None:
        """Server construction alone must not raise."""
        self._setup_server()

    async def test_chat_completions(self) -> None:
        """/v1/chat/completions fills in the configured model and forwards overrides."""
        server = self._setup_server()
        app = server.setup_webserver()
        client = TestClient(app)

        mock_chat_data = {
            "id": "chatcmpl-BzRdCFjIEIp59xXLBNYjdPPrcpDaa",  # pragma: allowlist secret
            "choices": [
                {
                    "finish_reason": "stop",
                    "index": 0,
                    "message": {
                        "content": "Hello! How can I help you today?",
                        "role": "assistant",
                    },
                }
            ],
            "created": 1753983922,
            "model": "dummy_model",
            "object": "chat.completion",
        }

        # Capture the kwargs of the most recent upstream call for inspection.
        called_args_chat = {}

        async def mock_create_chat(**kwargs):
            nonlocal called_args_chat
            called_args_chat = kwargs
            return mock_chat_data

        server._client = MagicMock(spec=AsyncAzureOpenAI)
        server._client.chat.completions.create = AsyncMock(side_effect=mock_create_chat)

        # Without "model" in the request body, the configured model is used.
        chat_no_model = client.post(
            "/v1/chat/completions",
            json={"messages": [{"role": "user", "content": "hi"}]},
        )
        assert chat_no_model.status_code == 200
        assert called_args_chat.get("model") == "dummy_model"

        # An explicit "model" in the request body wins over the config.
        chat_with_model = client.post(
            "/v1/chat/completions",
            json={
                "messages": [{"role": "user", "content": "hi"}],
                "model": "override_model",
            },
        )
        assert chat_with_model.status_code == 200
        assert called_args_chat.get("model") == "override_model"

        server._client.chat.completions.create.assert_any_await(
            messages=[{"role": "user", "content": "hi"}],
            model="override_model",
        )

    async def test_responses(self, monkeypatch: MonkeyPatch) -> None:
        """/v1/responses converts a mocked chat completion into a NeMoGymResponse."""
        server = self._setup_server()
        app = server.setup_webserver()
        client = TestClient(app)

        # Pin the generated response/message ids and the timestamp so the
        # expected NeMoGymResponse can be compared exactly.
        monkeypatch.setattr("responses_api_models.azure_openai_model.app.uuid4", lambda: FakeUUID())
        monkeypatch.setattr("responses_api_models.azure_openai_model.app.time", lambda: FIXED_TIME)
        monkeypatch.setattr("responses_api_models.vllm_model.app.uuid4", lambda: FakeUUID())

        mock_response_data = NeMoGymChatCompletion(
            id="chtcmpl-123",
            choices=[
                NeMoGymChoice(
                    index=0,
                    finish_reason="stop",
                    message=NeMoGymChatCompletionMessage(role="assistant", content="Hello! How can I help you today?"),
                )
            ],
            created=FIXED_TIME,
            model="dummy_model",
            object="chat.completion",
        )

        # Expected response
        expected_response = NeMoGymResponse(
            id="resp_123",
            created_at=FIXED_TIME,
            model="dummy_model",
            object="response",
            output=[
                NeMoGymResponseOutputMessage(
                    id="msg_123",
                    content=[
                        {
                            "annotations": [],
                            "text": "Hello! How can I help you today?",
                            "type": "output_text",
                        }
                    ],
                    role="assistant",
                    status="completed",
                    type="message",
                )
            ],
            parallel_tool_calls=True,
            tool_choice="auto",
            tools=[],
        )

        responses_create_params = NeMoGymResponseCreateParamsNonStreaming(input="hello")

        # Mock the Azure OpenAI client directly since responses() calls it directly
        server._client = MagicMock(spec=AsyncAzureOpenAI)
        server._client.chat.completions.create = AsyncMock(return_value=mock_response_data)

        response = client.post(
            "/v1/responses",
            json=responses_create_params.model_dump(exclude_unset=True, mode="json"),
        )
        assert response.status_code == 200
        assert expected_response.model_dump() == response.json()