Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,6 @@ Purpose: Training-ready environments with curated datasets.
| Mini Swe Agent | coding | <a href='https://huggingface.co/datasets/SWE-Gym/SWE-Gym'>SWE-Gym</a> | A software development with mini-swe-agent orchestration | Improve software development capabilities, like SWE-bench | <a href='resources_servers/mini_swe_agent/configs/mini_swe_agent.yaml'>config</a> | ✓ | ✓ | MIT |
| Instruction Following | instruction_following | <a href='https://huggingface.co/datasets/nvidia/Nemotron-RL-instruction_following'>Nemotron-RL-instruction_following</a> | Instruction following datasets targeting IFEval and IFBench style instruction following capabilities | Improve IFEval and IFBench | <a href='resources_servers/instruction_following/configs/instruction_following.yaml'>config</a> | ✓ | - | Apache 2.0 |
| Structured Outputs | instruction_following | <a href='https://huggingface.co/datasets/nvidia/Nemotron-RL-instruction_following-structured_outputs'>Nemotron-RL-instruction_following-structured_outputs</a> | Check if responses are following structured output requirements in prompts | Improve instruction following capabilities | <a href='resources_servers/structured_outputs/configs/structured_outputs_json.yaml'>config</a> | ✓ | ✓ | Apache 2.0 |
| Equivalence Llm Judge | knowledge | <a href='https://huggingface.co/datasets/nvidia/Nemotron-RL-knowledge-openQA'>Nemotron-RL-knowledge-openQA</a> | Short answer questions with LLM-as-a-judge | Improve knowledge-related benchmarks like GPQA / HLE | <a href='resources_servers/equivalence_llm_judge/configs/equivalence_llm_judge.yaml'>config</a> | ✓ | - | Apache 2.0 |
| Mcqa | knowledge | <a href='https://huggingface.co/datasets/nvidia/Nemotron-RL-knowledge-mcqa'>Nemotron-RL-knowledge-mcqa</a> | Multi-choice question answering problems | Improve benchmarks like MMLU / GPQA / HLE | <a href='resources_servers/mcqa/configs/mcqa.yaml'>config</a> | ✓ | - | Apache 2.0 |
| Math With Judge | math | <a href='https://huggingface.co/datasets/nvidia/Nemotron-RL-math-OpenMathReasoning'>Nemotron-RL-math-OpenMathReasoning</a> | Math dataset with math-verify and LLM-as-a-judge | Improve math capabilities including AIME 24 / 25 | <a href='resources_servers/math_with_judge/configs/math_with_judge.yaml'>config</a> | ✓ | ✓ | Creative Commons Attribution 4.0 International |
| Math With Judge | math | <a href='https://huggingface.co/datasets/nvidia/Nemotron-RL-math-stack_overflow'>Nemotron-RL-math-stack_overflow</a> | - | - | <a href='resources_servers/math_with_judge/configs/math_stack_overflow.yaml'>config</a> | ✓ | ✓ | Creative Commons Attribution-ShareAlike 4.0 International |
Expand Down
22 changes: 21 additions & 1 deletion resources_servers/calendar/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,27 @@ def setup_webserver(self) -> FastAPI:
return app

async def verify(self, body: CalendarVerifyRequest) -> BaseVerifyResponse:
assistant_response = body.response.output[-1].content[0].text
# Extract the assistant's text response from the last output item.
#
# For reasoning models (e.g., with deepseek_r1 reasoning_parser), the output
# structure is: [ReasoningItem, MessageItem] where:
# - ReasoningItem: has .reasoning attribute (thinking/CoT tokens)
# - MessageItem: has .content attribute (actual response text)
#
# The last item should be a MessageItem with .content, but if the model
# hit the token limit while still thinking, the last item will be a
# ReasoningItem without .content. In that case, we return reward=0.
assistant_response = ""
if body.response.output:
last_output = body.response.output[-1]
if hasattr(last_output, "content") and last_output.content:
assistant_response = last_output.content[0].text

# If no valid response (e.g., model only produced thinking tokens),
# return zero reward
if not assistant_response:
return BaseVerifyResponse(**body.model_dump(), reward=0)

exp_cal_state = body.exp_cal_state
try:
reward, reason = grade_assistant_response(assistant_response, exp_cal_state)
Expand Down
41 changes: 33 additions & 8 deletions resources_servers/equivalence_llm_judge/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@
# limitations under the License.
from __future__ import annotations

import asyncio
import re
from contextlib import nullcontext
from typing import Any, Optional

from fastapi import FastAPI
Expand Down Expand Up @@ -59,8 +61,11 @@ class LLMJudgeResourcesServerConfig(BaseResourcesServerConfig):
judge_model_server: ModelServerRef
judge_responses_create_params: NeMoGymResponseCreateParamsNonStreaming

# Concurrency limit for judge endpoint requests. Set to None to disable limiting.
judge_endpoint_max_concurrency: Optional[int] = 64

judge_system_message: Optional[str] = None
judge_prompt_template: str
judge_prompt_template_fpath: str = "prompt_templates/equivalence_llm_judge.txt"
judge_equal_label: str = "[[A=B]]"
judge_not_equal_label: str = "[[A!=B]]"
# Optional regex to extract the question from the last user message.
Expand Down Expand Up @@ -250,6 +255,17 @@ class LLMJudgeResourcesServer(SimpleResourcesServer):

config: LLMJudgeResourcesServerConfig

def __init__(self, *args, **kwargs):
    """Initialize the judge resources server.

    Beyond the parent-class setup, this (1) builds the concurrency limiter
    used around judge-model HTTP calls and (2) loads the judge prompt
    template from disk once at startup instead of per request.
    """
    super().__init__(*args, **kwargs)

    # When a limit is configured, gate judge requests behind a semaphore;
    # otherwise use nullcontext() so call sites can always write
    # `async with self._judge_endpoint_max_concurrency:` unconditionally.
    if self.config.judge_endpoint_max_concurrency is not None:
        self._judge_endpoint_max_concurrency = asyncio.Semaphore(value=self.config.judge_endpoint_max_concurrency)
    else:
        self._judge_endpoint_max_concurrency = nullcontext()

    # Read and cache the prompt template; strip() drops incidental
    # leading/trailing whitespace from the template file.
    with open(self.config.judge_prompt_template_fpath, "r") as f:
        self._judge_prompt_template = f.read().strip()

def setup_webserver(self) -> FastAPI:
    """Return the FastAPI application constructed by the parent class."""
    return super().setup_webserver()
Expand Down Expand Up @@ -420,7 +436,7 @@ async def _generate_judge_evaluation(
not_equal_label = cfg.judge_not_equal_label

responses_create_params = cfg.judge_responses_create_params.model_copy(deep=True)
prompt_template = cfg.judge_prompt_template
prompt_template = self._judge_prompt_template
system_message = cfg.judge_system_message

user_prompt = prompt_template.format(
Expand All @@ -433,12 +449,21 @@ async def _generate_judge_evaluation(
msgs.append(NeMoGymEasyInputMessage(role="user", content=user_prompt))
responses_create_params.input = msgs

response = await self.server_client.post(
server_name=cfg.judge_model_server.name,
url_path="/v1/responses",
json=responses_create_params,
)
judge_response = NeMoGymResponse.model_validate(await get_response_json(response))
async with self._judge_endpoint_max_concurrency:
try:
response = await self.server_client.post(
server_name=cfg.judge_model_server.name,
url_path="/v1/responses",
json=responses_create_params,
)
judge_response = NeMoGymResponse.model_validate(await get_response_json(response))
except Exception as e:
print(
f"DEBUG: LLMJudgeResourcesServer: judge model server HTTP POST error: {type(e).__name__} {e}",
flush=True,
)
raise e

eval_record = JudgeEvaluation(
responses_create_params=responses_create_params,
response=judge_response,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,64 +7,9 @@ equivalence_llm_judge:
name: policy_model
judge_responses_create_params:
input: []
judge_prompt_template: |-
===== System role =====
You are a meticulous STEM grader. Compare a candidate answer to a GOLD reference for a scientific question and decide strict equivalence.

Grading priorities (in order):
1) Factual equivalence to GOLD (accept algebraically/formally equivalent formulations).
2) Completeness on required parts — the candidate must include the same core parts/subclaims as the GOLD.

Rules:
- Treat GOLD as authoritative for what counts as correct.
- If GOLD is a range or set, the candidate is equivalent only if it lies within that range or is a member of that set.
- For formulas/derivations, accept mathematically identical transformations (e.g., symbol reordering, factoring, equivalent identities).
- Multi-part: all essential parts must match for “equivalent”; otherwise they are not equivalent.
- Be concise. Do NOT reveal or rewrite the GOLD.

Show your reason why they are equivalent or not equivalent first and then provide the output.

Output (at the end after double newlines):
- If equivalent: [[A=B]] they are equivalent
- If not equivalent: [[A!=B]] they are not equivalent

===== Example 1 (equivalent) =====
QUESTION:
State Avogadro’s constant (include units).

GOLD:
6.022 × 10^23 mol^-1

CANDIDATE:
6.022e23 per mole.

The candidate gives the same magnitude in scientific notation and the same “per mole” unit; no extra or missing qualifiers.

[[A=B]] they are equivalent

===== Example 2 (not equivalent) =====
QUESTION:
State the first law of thermodynamics for a closed system and identify what each symbol represents.

GOLD:
ΔU = Q − W ; ΔU is change in internal energy, Q is heat added to the system, W is work done by the system.

CANDIDATE:
ΔU = Q + W ; ΔU is change in internal energy, Q is heat added to the system, W is work done by the system.

The candidate uses the opposite sign convention for work relative to the required relationship; one core part is incorrect, so the overall statement does not match.

[[A!=B]] they are not equivalent

===== Inputs =====
QUESTION:
{question}

GOLD:
{expected_answer}

CANDIDATE:
{generated_answer}
judge_prompt_template_fpath: prompt_templates/equivalence_llm_judge.txt
# Concurrency and rate limiting configuration
    judge_endpoint_max_concurrency: 64 # Max concurrent judge requests; set to null to disable limiting
judge_system_message: null
judge_equal_label: "[[A=B]]"
judge_not_equal_label: "[[A!=B]]"
Expand Down Expand Up @@ -135,7 +80,7 @@ equivalence_llm_judge_simple_agent:
license: "TBD"
jsonl_fpath: resources_servers/equivalence_llm_judge/data/example_openqa.jsonl
- name: train
type: train
type: example
license: Apache 2.0
jsonl_fpath: resources_servers/equivalence_llm_judge/data/train.jsonl
huggingface_identifier:
Expand Down
1 change: 1 addition & 0 deletions resources_servers/equivalence_llm_judge/configs/lc.yaml
35 changes: 35 additions & 0 deletions resources_servers/equivalence_llm_judge/configs/lc_judge.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
lc_judge:
resources_servers:
equivalence_llm_judge:
entrypoint: app.py
judge_model_server:
type: responses_api_models
name: policy_model
judge_responses_create_params:
input: []
judge_prompt_template_fpath: prompt_templates/lc_judge.txt
judge_endpoint_max_concurrency: 64
judge_system_message: null
judge_equal_label: CORRECT
judge_not_equal_label: INCORRECT
check_twice_swap: false
reward_if_swap_fails: 0.0
question_extract_regex: ^QUESTION:\s*(.*)$
response_extract_regex: null
domain: knowledge
verified: false
lc_judge_simple_agent:
responses_api_agents:
simple_agent:
entrypoint: app.py
resources_server:
type: resources_servers
name: lc_judge
model_server:
type: responses_api_models
name: policy_model
datasets:
- name: example
type: example
license: TBD
jsonl_fpath: resources_servers/equivalence_llm_judge/data/example.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@

===== System role =====
You are a meticulous STEM grader. Compare a candidate answer to a GOLD reference for a scientific question and decide strict equivalence.

Grading priorities (in order):
1) Factual equivalence to GOLD (accept algebraically/formally equivalent formulations).
2) Completeness on required parts — the candidate must include the same core parts/subclaims as the GOLD.

Rules:
- Treat GOLD as authoritative for what counts as correct.
- If GOLD is a range or set, the candidate is equivalent only if it lies within that range or is a member of that set.
- For formulas/derivations, accept mathematically identical transformations (e.g., symbol reordering, factoring, equivalent identities).
- Multi-part: all essential parts must match for “equivalent”; otherwise they are not equivalent.
- Be concise. Do NOT reveal or rewrite the GOLD.

Show your reason why they are equivalent or not equivalent first and then provide the output.

Output (at the end after double newlines):
- If equivalent: [[A=B]] they are equivalent
- If not equivalent: [[A!=B]] they are not equivalent

===== Example 1 (equivalent) =====
QUESTION:
State Avogadro’s constant (include units).

GOLD:
6.022 × 10^23 mol^-1

CANDIDATE:
6.022e23 per mole.

The candidate gives the same magnitude in scientific notation and the same “per mole” unit; no extra or missing qualifiers.

[[A=B]] they are equivalent

===== Example 2 (not equivalent) =====
QUESTION:
State the first law of thermodynamics for a closed system and identify what each symbol represents.

GOLD:
ΔU = Q − W ; ΔU is change in internal energy, Q is heat added to the system, W is work done by the system.

CANDIDATE:
ΔU = Q + W ; ΔU is change in internal energy, Q is heat added to the system, W is work done by the system.

The candidate uses the opposite sign convention for work relative to the required relationship; one core part is incorrect, so the overall statement does not match.

[[A!=B]] they are not equivalent

===== Inputs =====
QUESTION:
{question}

GOLD:
{expected_answer}

CANDIDATE:
{generated_answer}
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@

Assess whether the following CANDIDATE ANSWER is CORRECT or INCORRECT.
For the CANDIDATE ANSWER to be correct, it must be consistent with the OFFICIAL ANSWER.

The question, for reference only: {question}
The OFFICIAL ANSWER: {expected_answer}
CANDIDATE ANSWER TO ASSESS: {generated_answer}

Reply only with CORRECT or INCORRECT.
10 changes: 3 additions & 7 deletions resources_servers/equivalence_llm_judge/tests/test_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock

from omegaconf import OmegaConf
from pytest import approx, fixture

from nemo_gym.config_types import ModelServerRef
Expand All @@ -38,11 +37,8 @@
class TestApp:
@fixture
def config(self) -> LLMJudgeResourcesServerConfig:
# Load judge template from YAML so tests mirror runtime config
yaml_path = Path(__file__).resolve().parents[1] / "configs" / "equivalence_llm_judge.yaml"
yaml_cfg = OmegaConf.load(str(yaml_path))
judge_template: str = (
yaml_cfg.equivalence_llm_judge.resources_servers.equivalence_llm_judge.judge_prompt_template
judge_prompt_template_fpath = str(
Path(__file__).resolve().parents[1] / "prompt_templates/equivalence_llm_judge.txt"
)

cfg = LLMJudgeResourcesServerConfig(
Expand All @@ -51,7 +47,7 @@ def config(self) -> LLMJudgeResourcesServerConfig:
entrypoint="",
judge_model_server=ModelServerRef(type="responses_api_models", name="judge"),
judge_responses_create_params=NeMoGymResponseCreateParamsNonStreaming(input=[]),
judge_prompt_template=judge_template,
judge_prompt_template_fpath=judge_prompt_template_fpath,
)
cfg.judge_equal_label = "[[A=B]]"
cfg.judge_not_equal_label = "[[A!=B]]"
Expand Down
6 changes: 6 additions & 0 deletions resources_servers/multichallenge/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Ignore data files (datasets should not be committed)
data/advanced/
data/vanilla/
*.json
*.jsonl
!configs/*.yaml
Loading