Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,6 @@ Purpose: Training-ready environments with curated datasets.
| Mini Swe Agent | coding | <a href='https://huggingface.co/datasets/SWE-Gym/SWE-Gym'>SWE-Gym</a> | A software development with mini-swe-agent orchestration | Improve software development capabilities, like SWE-bench | <a href='resources_servers/mini_swe_agent/configs/mini_swe_agent.yaml'>config</a> | ✓ | ✓ | MIT |
| Instruction Following | instruction_following | <a href='https://huggingface.co/datasets/nvidia/Nemotron-RL-instruction_following'>Nemotron-RL-instruction_following</a> | Instruction following datasets targeting IFEval and IFBench style instruction following capabilities | Improve IFEval and IFBench | <a href='resources_servers/instruction_following/configs/instruction_following.yaml'>config</a> | ✓ | - | Apache 2.0 |
| Structured Outputs | instruction_following | <a href='https://huggingface.co/datasets/nvidia/Nemotron-RL-instruction_following-structured_outputs'>Nemotron-RL-instruction_following-structured_outputs</a> | Check if responses are following structured output requirements in prompts | Improve instruction following capabilities | <a href='resources_servers/structured_outputs/configs/structured_outputs_json.yaml'>config</a> | ✓ | ✓ | Apache 2.0 |
| Equivalence Llm Judge | knowledge | <a href='https://huggingface.co/datasets/nvidia/Nemotron-RL-knowledge-openQA'>Nemotron-RL-knowledge-openQA</a> | Short answer questions with LLM-as-a-judge | Improve knowledge-related benchmarks like GPQA / HLE | <a href='resources_servers/equivalence_llm_judge/configs/equivalence_llm_judge.yaml'>config</a> | ✓ | - | Apache 2.0 |
| Mcqa | knowledge | <a href='https://huggingface.co/datasets/nvidia/Nemotron-RL-knowledge-mcqa'>Nemotron-RL-knowledge-mcqa</a> | Multi-choice question answering problems | Improve benchmarks like MMLU / GPQA / HLE | <a href='resources_servers/mcqa/configs/mcqa.yaml'>config</a> | ✓ | - | Apache 2.0 |
| Math With Judge | math | <a href='https://huggingface.co/datasets/nvidia/Nemotron-RL-math-OpenMathReasoning'>Nemotron-RL-math-OpenMathReasoning</a> | Math dataset with math-verify and LLM-as-a-judge | Improve math capabilities including AIME 24 / 25 | <a href='resources_servers/math_with_judge/configs/math_with_judge.yaml'>config</a> | ✓ | ✓ | Creative Commons Attribution 4.0 International |
| Math With Judge | math | <a href='https://huggingface.co/datasets/nvidia/Nemotron-RL-math-stack_overflow'>Nemotron-RL-math-stack_overflow</a> | - | - | <a href='resources_servers/math_with_judge/configs/math_stack_overflow.yaml'>config</a> | ✓ | ✓ | Creative Commons Attribution-ShareAlike 4.0 International |
Expand Down
22 changes: 21 additions & 1 deletion resources_servers/calendar/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,27 @@ def setup_webserver(self) -> FastAPI:
return app

async def verify(self, body: CalendarVerifyRequest) -> BaseVerifyResponse:
assistant_response = body.response.output[-1].content[0].text
# Extract the assistant's text response from the last output item.
#
# For reasoning models (e.g., with deepseek_r1 reasoning_parser), the output
# structure is: [ReasoningItem, MessageItem] where:
# - ReasoningItem: has .reasoning attribute (thinking/CoT tokens)
# - MessageItem: has .content attribute (actual response text)
#
# The last item should be a MessageItem with .content, but if the model
# hit the token limit while still thinking, the last item will be a
# ReasoningItem without .content. In that case, we return reward=0.
assistant_response = ""
if body.response.output:
last_output = body.response.output[-1]
if hasattr(last_output, "content") and last_output.content:
assistant_response = last_output.content[0].text

# If no valid response (e.g., model only produced thinking tokens),
# return zero reward
if not assistant_response:
return BaseVerifyResponse(**body.model_dump(), reward=0)

exp_cal_state = body.exp_cal_state
try:
reward, reason = grade_assistant_response(assistant_response, exp_cal_state)
Expand Down
41 changes: 33 additions & 8 deletions resources_servers/equivalence_llm_judge/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@
# limitations under the License.
from __future__ import annotations

import asyncio
import re
from contextlib import nullcontext
from typing import Any, Optional

from fastapi import FastAPI
Expand Down Expand Up @@ -59,8 +61,11 @@ class LLMJudgeResourcesServerConfig(BaseResourcesServerConfig):
judge_model_server: ModelServerRef
judge_responses_create_params: NeMoGymResponseCreateParamsNonStreaming

# Concurrency limit for judge endpoint requests. Set to None to disable limiting.
judge_endpoint_max_concurrency: Optional[int] = 64

judge_system_message: Optional[str] = None
judge_prompt_template: str
judge_prompt_template_fpath: str = "prompt_templates/equivalence_llm_judge.txt"
judge_equal_label: str = "[[A=B]]"
judge_not_equal_label: str = "[[A!=B]]"
# Optional regex to extract the question from the last user message.
Expand Down Expand Up @@ -250,6 +255,17 @@ class LLMJudgeResourcesServer(SimpleResourcesServer):

config: LLMJudgeResourcesServerConfig

def __init__(self, *args, **kwargs):
    """Initialize the judge resources server.

    Beyond the parent-class setup, this (1) builds the concurrency limiter
    used around judge-model HTTP calls and (2) loads the judge prompt
    template from disk once at startup instead of per request.
    """
    super().__init__(*args, **kwargs)

    # When a limit is configured, gate judge requests behind a semaphore;
    # otherwise use nullcontext() so call sites can always write
    # `async with self._judge_endpoint_max_concurrency:` unconditionally.
    if self.config.judge_endpoint_max_concurrency is not None:
        self._judge_endpoint_max_concurrency = asyncio.Semaphore(value=self.config.judge_endpoint_max_concurrency)
    else:
        self._judge_endpoint_max_concurrency = nullcontext()

    # Read and cache the prompt template; strip() drops incidental
    # leading/trailing whitespace from the template file.
    with open(self.config.judge_prompt_template_fpath, "r") as f:
        self._judge_prompt_template = f.read().strip()

def setup_webserver(self) -> FastAPI:
    """Return the FastAPI application constructed by the parent class."""
    return super().setup_webserver()
Expand Down Expand Up @@ -420,7 +436,7 @@ async def _generate_judge_evaluation(
not_equal_label = cfg.judge_not_equal_label

responses_create_params = cfg.judge_responses_create_params.model_copy(deep=True)
prompt_template = cfg.judge_prompt_template
prompt_template = self._judge_prompt_template
system_message = cfg.judge_system_message

user_prompt = prompt_template.format(
Expand All @@ -433,12 +449,21 @@ async def _generate_judge_evaluation(
msgs.append(NeMoGymEasyInputMessage(role="user", content=user_prompt))
responses_create_params.input = msgs

response = await self.server_client.post(
server_name=cfg.judge_model_server.name,
url_path="/v1/responses",
json=responses_create_params,
)
judge_response = NeMoGymResponse.model_validate(await get_response_json(response))
async with self._judge_endpoint_max_concurrency:
try:
response = await self.server_client.post(
server_name=cfg.judge_model_server.name,
url_path="/v1/responses",
json=responses_create_params,
)
judge_response = NeMoGymResponse.model_validate(await get_response_json(response))
except Exception as e:
print(
f"DEBUG: LLMJudgeResourcesServer: judge model server HTTP POST error: {type(e).__name__} {e}",
flush=True,
)
raise e

eval_record = JudgeEvaluation(
responses_create_params=responses_create_params,
response=judge_response,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,64 +7,9 @@ equivalence_llm_judge:
name: policy_model
judge_responses_create_params:
input: []
judge_prompt_template: |-
===== System role =====
You are a meticulous STEM grader. Compare a candidate answer to a GOLD reference for a scientific question and decide strict equivalence.

Grading priorities (in order):
1) Factual equivalence to GOLD (accept algebraically/formally equivalent formulations).
2) Completeness on required parts — the candidate must include the same core parts/subclaims as the GOLD.

Rules:
- Treat GOLD as authoritative for what counts as correct.
- If GOLD is a range or set, the candidate is equivalent only if it lies within that range or is a member of that set.
- For formulas/derivations, accept mathematically identical transformations (e.g., symbol reordering, factoring, equivalent identities).
- Multi-part: all essential parts must match for “equivalent”; otherwise they are not equivalent.
- Be concise. Do NOT reveal or rewrite the GOLD.

Show your reason why they are equivalent or not equivalent first and then provide the output.

Output (at the end after double newlines):
- If equivalent: [[A=B]] they are equivalent
- If not equivalent: [[A!=B]] they are not equivalent

===== Example 1 (equivalent) =====
QUESTION:
State Avogadro’s constant (include units).

GOLD:
6.022 × 10^23 mol^-1

CANDIDATE:
6.022e23 per mole.

The candidate gives the same magnitude in scientific notation and the same “per mole” unit; no extra or missing qualifiers.

[[A=B]] they are equivalent

===== Example 2 (not equivalent) =====
QUESTION:
State the first law of thermodynamics for a closed system and identify what each symbol represents.

GOLD:
ΔU = Q − W ; ΔU is change in internal energy, Q is heat added to the system, W is work done by the system.

CANDIDATE:
ΔU = Q + W ; ΔU is change in internal energy, Q is heat added to the system, W is work done by the system.

The candidate uses the opposite sign convention for work relative to the required relationship; one core part is incorrect, so the overall statement does not match.

[[A!=B]] they are not equivalent

===== Inputs =====
QUESTION:
{question}

GOLD:
{expected_answer}

CANDIDATE:
{generated_answer}
judge_prompt_template_fpath: prompt_templates/equivalence_llm_judge.txt
# Concurrency and rate limiting configuration
    judge_endpoint_max_concurrency: 64 # Max concurrent judge requests; set to null to disable limiting
judge_system_message: null
judge_equal_label: "[[A=B]]"
judge_not_equal_label: "[[A!=B]]"
Expand Down Expand Up @@ -135,7 +80,7 @@ equivalence_llm_judge_simple_agent:
license: "TBD"
jsonl_fpath: resources_servers/equivalence_llm_judge/data/example_openqa.jsonl
- name: train
type: train
type: example
license: Apache 2.0
jsonl_fpath: resources_servers/equivalence_llm_judge/data/train.jsonl
huggingface_identifier:
Expand Down
1 change: 1 addition & 0 deletions resources_servers/equivalence_llm_judge/configs/lc.yaml
35 changes: 35 additions & 0 deletions resources_servers/equivalence_llm_judge/configs/lc_judge.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
lc_judge:
resources_servers:
equivalence_llm_judge:
entrypoint: app.py
judge_model_server:
type: responses_api_models
name: policy_model
judge_responses_create_params:
input: []
judge_prompt_template_fpath: prompt_templates/lc_judge.txt
judge_endpoint_max_concurrency: 64
judge_system_message: null
judge_equal_label: CORRECT
judge_not_equal_label: INCORRECT
check_twice_swap: false
reward_if_swap_fails: 0.0
question_extract_regex: ^QUESTION:\s*(.*)$
response_extract_regex: null
domain: knowledge
verified: false
lc_judge_simple_agent:
responses_api_agents:
simple_agent:
entrypoint: app.py
resources_server:
type: resources_servers
name: lc_judge
model_server:
type: responses_api_models
name: policy_model
datasets:
- name: example
type: example
license: TBD
jsonl_fpath: resources_servers/equivalence_llm_judge/data/example.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@

===== System role =====
You are a meticulous STEM grader. Compare a candidate answer to a GOLD reference for a scientific question and decide strict equivalence.

Grading priorities (in order):
1) Factual equivalence to GOLD (accept algebraically/formally equivalent formulations).
2) Completeness on required parts — the candidate must include the same core parts/subclaims as the GOLD.

Rules:
- Treat GOLD as authoritative for what counts as correct.
- If GOLD is a range or set, the candidate is equivalent only if it lies within that range or is a member of that set.
- For formulas/derivations, accept mathematically identical transformations (e.g., symbol reordering, factoring, equivalent identities).
- Multi-part: all essential parts must match for “equivalent”; otherwise they are not equivalent.
- Be concise. Do NOT reveal or rewrite the GOLD.

Show your reason why they are equivalent or not equivalent first and then provide the output.

Output (at the end after double newlines):
- If equivalent: [[A=B]] they are equivalent
- If not equivalent: [[A!=B]] they are not equivalent

===== Example 1 (equivalent) =====
QUESTION:
State Avogadro’s constant (include units).

GOLD:
6.022 × 10^23 mol^-1

CANDIDATE:
6.022e23 per mole.

The candidate gives the same magnitude in scientific notation and the same “per mole” unit; no extra or missing qualifiers.

[[A=B]] they are equivalent

===== Example 2 (not equivalent) =====
QUESTION:
State the first law of thermodynamics for a closed system and identify what each symbol represents.

GOLD:
ΔU = Q − W ; ΔU is change in internal energy, Q is heat added to the system, W is work done by the system.

CANDIDATE:
ΔU = Q + W ; ΔU is change in internal energy, Q is heat added to the system, W is work done by the system.

The candidate uses the opposite sign convention for work relative to the required relationship; one core part is incorrect, so the overall statement does not match.

[[A!=B]] they are not equivalent

===== Inputs =====
QUESTION:
{question}

GOLD:
{expected_answer}

CANDIDATE:
{generated_answer}
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@

Assess whether the following CANDIDATE ANSWER is CORRECT or INCORRECT.
For the CANDIDATE ANSWER to be correct, it must be consistent with the OFFICIAL ANSWER.

The question, for reference only: {question}
The OFFICIAL ANSWER: {expected_answer}
CANDIDATE ANSWER TO ASSESS: {generated_answer}

Reply only with CORRECT or INCORRECT.
10 changes: 3 additions & 7 deletions resources_servers/equivalence_llm_judge/tests/test_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock

from omegaconf import OmegaConf
from pytest import approx, fixture

from nemo_gym.config_types import ModelServerRef
Expand All @@ -38,11 +37,8 @@
class TestApp:
@fixture
def config(self) -> LLMJudgeResourcesServerConfig:
# Load judge template from YAML so tests mirror runtime config
yaml_path = Path(__file__).resolve().parents[1] / "configs" / "equivalence_llm_judge.yaml"
yaml_cfg = OmegaConf.load(str(yaml_path))
judge_template: str = (
yaml_cfg.equivalence_llm_judge.resources_servers.equivalence_llm_judge.judge_prompt_template
judge_prompt_template_fpath = str(
Path(__file__).resolve().parents[1] / "prompt_templates/equivalence_llm_judge.txt"
)

cfg = LLMJudgeResourcesServerConfig(
Expand All @@ -51,7 +47,7 @@ def config(self) -> LLMJudgeResourcesServerConfig:
entrypoint="",
judge_model_server=ModelServerRef(type="responses_api_models", name="judge"),
judge_responses_create_params=NeMoGymResponseCreateParamsNonStreaming(input=[]),
judge_prompt_template=judge_template,
judge_prompt_template_fpath=judge_prompt_template_fpath,
)
cfg.judge_equal_label = "[[A=B]]"
cfg.judge_not_equal_label = "[[A!=B]]"
Expand Down
6 changes: 6 additions & 0 deletions resources_servers/multichallenge/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Ignore data files (datasets should not be committed)
data/advanced/
data/vanilla/
*.json
*.jsonl
!configs/*.yaml
Loading