Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions responses_api_models/azure_openai_model/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Azure OpenAI Model

Use this model server to access Azure OpenAI endpoints for LLM-as-a-judge.

## Configuration

Set up your `env.yaml` file:
```yaml
policy_base_url: https://my.end.point.com/v1/azure
policy_api_key: <API_KEY>
policy_model_name: gpt-5-nano
policy_api_version: 2024-10-21
```

## Usage

### Running the server
```bash
config_paths="responses_api_models/azure_openai_model/configs/azure_openai_model.yaml, \
resources_servers/equivalence_llm_judge/configs/equivalence_llm_judge.yaml"

ng_run "+config_paths=[${config_paths}]" \
+policy_model.responses_api_models.azure_openai_model.default_query.api-version=<api_version>
```

### Collecting Rollouts

```bash
ng_collect_rollouts \
+agent_name=equivalence_llm_judge_simple_agent \
+input_jsonl_fpath=resources_servers/equivalence_llm_judge/data/example.jsonl \
+output_jsonl_fpath=results/example_rollouts.jsonl \
+limit=5
```

### Test cases

```bash
ng_test +entrypoint=responses_api_models/azure_openai_model
```

## Licensing information

- **Code**: Apache 2.0
- **Data**: N/A

## Dependencies

- `nemo_gym`: Apache 2.0
104 changes: 104 additions & 0 deletions responses_api_models/azure_openai_model/app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from asyncio import Semaphore
from time import time
from uuid import uuid4

from fastapi import Request
from openai import AsyncAzureOpenAI

from nemo_gym.base_responses_api_model import (
BaseResponsesAPIModelConfig,
Body,
SimpleResponsesAPIModel,
)
from nemo_gym.openai_utils import (
NeMoGymChatCompletion,
NeMoGymChatCompletionCreateParamsNonStreaming,
NeMoGymResponse,
NeMoGymResponseCreateParamsNonStreaming,
)
from responses_api_models.vllm_model.app import VLLMConverter


class AzureOpenAIModelServerConfig(BaseResponsesAPIModelConfig):
    """Configuration for a model server backed by an Azure OpenAI endpoint."""

    # Azure endpoint URL; passed to the client as ``azure_endpoint``.
    openai_base_url: str
    # API key for the Azure OpenAI resource.
    openai_api_key: str
    # Deployment/model name used when a request does not specify "model".
    openai_model: str
    # Extra query parameters; the "api-version" entry is required by Azure.
    default_query: dict
    # Maximum number of concurrent /v1/responses requests served at once.
    num_concurrent_requests: int


class AzureOpenAIModelServer(SimpleResponsesAPIModel):
    """Responses API model server backed by an Azure OpenAI endpoint.

    Serves ``/v1/responses`` by converting the request into a chat-completions
    call (via the vLLM converter) and ``/v1/chat/completions`` by proxying the
    request straight through to Azure OpenAI.
    """

    config: AzureOpenAIModelServerConfig

    def model_post_init(self, context):
        # Azure requires an explicit api-version; it is taken from the
        # configured default_query mapping (None when the key is missing).
        self._client = AsyncAzureOpenAI(
            azure_endpoint=self.config.openai_base_url,
            api_key=self.config.openai_api_key,
            api_version=self.config.default_query.get("api-version"),
        )
        # Reuse the vLLM responses<->chat-completions converter; Azure does
        # not return token-id information, so that feature is disabled.
        self._converter = VLLMConverter(return_token_id_information=False)
        # Bounds the number of in-flight /v1/responses requests.
        self._semaphore: Semaphore = Semaphore(self.config.num_concurrent_requests)
        return super().model_post_init(context)

    async def responses(
        self, request: Request, body: NeMoGymResponseCreateParamsNonStreaming = Body()
    ) -> NeMoGymResponse:
        """Handle a Responses API request by delegating to chat completions.

        The request body is converted to chat-completion parameters, sent to
        Azure, and the first choice is converted back into a Responses payload.
        """
        async with self._semaphore:
            # Forward only explicitly-set fields; default the model to the
            # configured deployment when the caller did not specify one.
            chat_completion_create_params = self._converter.responses_to_chat_completion_create_params(body)
            chat_completion_params_dict = chat_completion_create_params.model_dump(exclude_unset=True)
            chat_completion_params_dict.setdefault("model", self.config.openai_model)
            chat_completion_response = await self._client.chat.completions.create(**chat_completion_params_dict)

            # Only the first choice is surfaced in the Responses payload.
            choice = chat_completion_response.choices[0]
            response_output = self._converter.postprocess_chat_response(choice)
            response_output_dicts = [item.model_dump() for item in response_output]
            return NeMoGymResponse(
                id=f"resp_{uuid4().hex}",
                created_at=int(time()),
                model=self.config.openai_model,
                object="response",
                output=response_output_dicts,
                # NOTE(review): `"tool_choice" in body` relies on the body
                # model's membership/iteration semantics — confirm it detects
                # an explicitly-set field as intended.
                tool_choice=body.tool_choice if "tool_choice" in body else "auto",
                parallel_tool_calls=body.parallel_tool_calls,
                tools=body.tools,
                temperature=body.temperature,
                top_p=body.top_p,
                background=body.background,
                max_output_tokens=body.max_output_tokens,
                max_tool_calls=body.max_tool_calls,
                previous_response_id=body.previous_response_id,
                prompt=body.prompt,
                reasoning=body.reasoning,
                service_tier=body.service_tier,
                text=body.text,
                top_logprobs=body.top_logprobs,
                truncation=body.truncation,
                metadata=body.metadata,
                instructions=body.instructions,
                user=body.user,
            )

    async def chat_completions(
        self, request: Request, body: NeMoGymChatCompletionCreateParamsNonStreaming = Body()
    ) -> NeMoGymChatCompletion:
        """Proxy a chat-completions request directly to Azure OpenAI.

        Unlike responses(), this path is not limited by the semaphore.
        """
        # Forward only explicitly-set fields; inject the configured model
        # when the caller did not supply one.
        body_dict = body.model_dump(exclude_unset=True)
        body_dict.setdefault("model", self.config.openai_model)
        openai_response_dict = await self._client.chat.completions.create(**body_dict)
        # NOTE(review): despite the name, this is the SDK response object, not
        # a dict — presumably model_validate accepts it as-is; confirm.
        return NeMoGymChatCompletion.model_validate(openai_response_dict)


if __name__ == "__main__":
    # Start the FastAPI web server for this model.
    AzureOpenAIModelServer.run_webserver()
40 changes: 40 additions & 0 deletions responses_api_models/azure_openai_model/client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from asyncio import run

from nemo_gym.server_utils import ServerClient


# Client resolved from the global NeMo Gym configuration.
server_client = ServerClient.load_from_global_config()


async def main():
    """Smoke-test the policy model server over both of its endpoints."""
    responses_reply = await server_client.post(
        server_name="policy_model",
        url_path="/v1/responses",
        json={"input": "hello"},
    )
    chat_reply = await server_client.post(
        server_name="policy_model",
        url_path="/v1/chat/completions",
        json={
            "messages": [{"role": "user", "content": "hello"}],
        },
    )
    # Print the /v1/responses result first, then /v1/chat/completions.
    for reply in (responses_reply, chat_reply):
        print(await reply.json())


if __name__ == "__main__":
    run(main())
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
policy_model:
  responses_api_models:
    azure_openai_model:
      entrypoint: app.py
      # Endpoint, key, and model are resolved from env.yaml variables.
      openai_base_url: ${policy_base_url}
      openai_api_key: ${policy_api_key}
      openai_model: ${policy_model_name}
      # For Azure OpenAI only: an explicit api-version is required.
      # ??? must be overridden, e.g. via
      # +policy_model.responses_api_models.azure_openai_model.default_query.api-version=<api_version>
      default_query:
        api-version: ???
      # Maximum number of concurrent /v1/responses requests.
      num_concurrent_requests: 8
1 change: 1 addition & 0 deletions responses_api_models/azure_openai_model/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Editable install of the repository root with dev extras.
# NOTE(review): pip's -e normally takes a bare path/URL; confirm this
# "name @ path" form is handled by the project's tooling.
-e nemo-gym[dev] @ ../../
178 changes: 178 additions & 0 deletions responses_api_models/azure_openai_model/tests/test_app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from unittest.mock import AsyncMock, MagicMock

from fastapi.testclient import TestClient
from openai import AsyncAzureOpenAI
from pytest import MonkeyPatch

from nemo_gym.openai_utils import (
NeMoGymChatCompletion,
NeMoGymChatCompletionMessage,
NeMoGymChoice,
NeMoGymResponse,
NeMoGymResponseCreateParamsNonStreaming,
NeMoGymResponseOutputMessage,
)
from nemo_gym.server_utils import ServerClient
from responses_api_models.azure_openai_model.app import (
AzureOpenAIModelServer,
AzureOpenAIModelServerConfig,
)


# Used for mocking created_at timestamp generation
FIXED_TIME = 1691418000
# Hex digest substituted for uuid4() so generated ids are deterministic.
FIXED_UUID = "123"


class FakeUUID:
    """Used for mocking UUIDs."""

    # Mirrors uuid.UUID.hex, which the server uses to build response ids.
    hex = FIXED_UUID


class TestApp:
    """Unit tests for AzureOpenAIModelServer using a mocked Azure client."""

    def _setup_server(self, monkeypatch=None):
        # Builds a server wired to dummy credentials; monkeypatch is accepted
        # for call-site symmetry but is not used here.
        config = AzureOpenAIModelServerConfig(
            host="0.0.0.0",
            port=8081,
            openai_base_url="https://prod.api.nvidia.com/llm/v1/azure",
            openai_api_key="dummy_key",  # pragma: allowlist secret
            openai_model="dummy_model",
            default_query={"api-version": "dummy_version"},
            num_concurrent_requests=8,
            entrypoint="",
            name="",
        )
        return AzureOpenAIModelServer(config=config, server_client=MagicMock(spec=ServerClient))

    async def test_sanity(self) -> None:
        # Server construction alone should not raise.
        self._setup_server()

    async def test_chat_completions(self, monkeypatch: MonkeyPatch) -> None:
        """/v1/chat/completions proxies through and injects the default model.

        The monkeypatch fixture is declared for signature consistency but is
        not used in this test.
        """
        server = self._setup_server()
        app = server.setup_webserver()
        client = TestClient(app)

        # Minimal chat.completion payload the mocked Azure client returns.
        mock_chat_data = {
            "id": "chatcmpl-BzRdCFjIEIp59xXLBNYjdPPrcpDaa",  # pragma: allowlist secret
            "choices": [
                {
                    "finish_reason": "stop",
                    "index": 0,
                    "message": {
                        "content": "Hello! How can I help you today?",
                        "role": "assistant",
                    },
                }
            ],
            "created": 1753983922,
            "model": "dummy_model",
            "object": "chat.completion",
        }

        # Captures the kwargs the server forwards to the Azure client.
        called_args_chat = {}

        async def mock_create_chat(**kwargs):
            nonlocal called_args_chat
            called_args_chat = kwargs
            return mock_chat_data

        server._client = MagicMock(spec=AsyncAzureOpenAI)
        server._client.chat.completions.create = AsyncMock(side_effect=mock_create_chat)

        # Without an explicit model, the configured default must be injected.
        chat_no_model = client.post(
            "/v1/chat/completions",
            json={"messages": [{"role": "user", "content": "hi"}]},
        )
        assert chat_no_model.status_code == 200
        assert called_args_chat.get("model") == "dummy_model"

        # An explicit model in the request body wins over the default.
        chat_with_model = client.post(
            "/v1/chat/completions",
            json={
                "messages": [{"role": "user", "content": "hi"}],
                "model": "override_model",
            },
        )
        assert chat_with_model.status_code == 200
        assert called_args_chat.get("model") == "override_model"

        server._client.chat.completions.create.assert_any_await(
            messages=[{"role": "user", "content": "hi"}],
            model="override_model",
        )

    async def test_responses(self, monkeypatch: MonkeyPatch) -> None:
        """/v1/responses converts a chat completion into a Responses payload."""
        server = self._setup_server(monkeypatch)
        app = server.setup_webserver()
        client = TestClient(app)

        # Pin uuid4/time in both the server module and the vLLM converter
        # module so generated ids and timestamps are deterministic.
        monkeypatch.setattr("responses_api_models.azure_openai_model.app.uuid4", lambda: FakeUUID())
        monkeypatch.setattr("responses_api_models.azure_openai_model.app.time", lambda: FIXED_TIME)
        monkeypatch.setattr("responses_api_models.vllm_model.app.uuid4", lambda: FakeUUID())

        # Chat completion the mocked Azure client returns to the server.
        mock_response_data = NeMoGymChatCompletion(
            id="chtcmpl-123",
            choices=[
                NeMoGymChoice(
                    index=0,
                    finish_reason="stop",
                    message=NeMoGymChatCompletionMessage(role="assistant", content="Hello! How can I help you today?"),
                )
            ],
            created=FIXED_TIME,
            model="dummy_model",
            object="chat.completion",
        )

        # Expected response
        expected_response = NeMoGymResponse(
            id="resp_123",
            created_at=FIXED_TIME,
            model="dummy_model",
            object="response",
            output=[
                NeMoGymResponseOutputMessage(
                    id="msg_123",
                    content=[
                        {
                            "annotations": [],
                            "text": "Hello! How can I help you today?",
                            "type": "output_text",
                        }
                    ],
                    role="assistant",
                    status="completed",
                    type="message",
                )
            ],
            parallel_tool_calls=True,
            tool_choice="auto",
            tools=[],
        )

        responses_create_params = NeMoGymResponseCreateParamsNonStreaming(input="hello")

        # Mock the Azure OpenAI client directly since responses() calls it directly
        server._client = MagicMock(spec=AsyncAzureOpenAI)
        server._client.chat.completions.create = AsyncMock(return_value=mock_response_data)

        response = client.post(
            "/v1/responses",
            json=responses_create_params.model_dump(exclude_unset=True, mode="json"),
        )
        assert response.status_code == 200
        assert expected_response.model_dump() == response.json()