diff --git a/README.md b/README.md
index eb29dc65d..162ca6b80 100644
--- a/README.md
+++ b/README.md
@@ -160,7 +160,6 @@ Purpose: Training-ready environments with curated datasets.
| Mini Swe Agent | coding | SWE-Gym | A software development with mini-swe-agent orchestration | Improve software development capabilities, like SWE-bench | config | ✓ | ✓ | MIT |
| Instruction Following | instruction_following | Nemotron-RL-instruction_following | Instruction following datasets targeting IFEval and IFBench style instruction following capabilities | Improve IFEval and IFBench | config | ✓ | - | Apache 2.0 |
| Structured Outputs | instruction_following | Nemotron-RL-instruction_following-structured_outputs | Check if responses are following structured output requirements in prompts | Improve instruction following capabilities | config | ✓ | ✓ | Apache 2.0 |
-| Equivalence Llm Judge | knowledge | Nemotron-RL-knowledge-openQA | Short answer questions with LLM-as-a-judge | Improve knowledge-related benchmarks like GPQA / HLE | config | ✓ | - | Apache 2.0 |
| Mcqa | knowledge | Nemotron-RL-knowledge-mcqa | Multi-choice question answering problems | Improve benchmarks like MMLU / GPQA / HLE | config | ✓ | - | Apache 2.0 |
| Math With Judge | math | Nemotron-RL-math-OpenMathReasoning | Math dataset with math-verify and LLM-as-a-judge | Improve math capabilities including AIME 24 / 25 | config | ✓ | ✓ | Creative Commons Attribution 4.0 International |
| Math With Judge | math | Nemotron-RL-math-stack_overflow | - | - | config | ✓ | ✓ | Creative Commons Attribution-ShareAlike 4.0 International |
diff --git a/resources_servers/calendar/app.py b/resources_servers/calendar/app.py
index c844d7f4c..f0afeab64 100644
--- a/resources_servers/calendar/app.py
+++ b/resources_servers/calendar/app.py
@@ -47,7 +47,27 @@ def setup_webserver(self) -> FastAPI:
return app
async def verify(self, body: CalendarVerifyRequest) -> BaseVerifyResponse:
- assistant_response = body.response.output[-1].content[0].text
+ # Extract the assistant's text response from the last output item.
+ #
+ # For reasoning models (e.g., with deepseek_r1 reasoning_parser), the output
+ # structure is: [ReasoningItem, MessageItem] where:
+ # - ReasoningItem: has .reasoning attribute (thinking/CoT tokens)
+ # - MessageItem: has .content attribute (actual response text)
+ #
+ # The last item should be a MessageItem with .content, but if the model
+ # hit the token limit while still thinking, the last item will be a
+ # ReasoningItem without .content. In that case, we return reward=0.
+ assistant_response = ""
+ if body.response.output:
+ last_output = body.response.output[-1]
+ if hasattr(last_output, "content") and last_output.content:
+ assistant_response = last_output.content[0].text
+
+ # If no valid response (e.g., model only produced thinking tokens),
+ # return zero reward
+ if not assistant_response:
+ return BaseVerifyResponse(**body.model_dump(), reward=0)
+
exp_cal_state = body.exp_cal_state
try:
reward, reason = grade_assistant_response(assistant_response, exp_cal_state)
diff --git a/resources_servers/equivalence_llm_judge/app.py b/resources_servers/equivalence_llm_judge/app.py
index 3ff412310..a8da06f93 100644
--- a/resources_servers/equivalence_llm_judge/app.py
+++ b/resources_servers/equivalence_llm_judge/app.py
@@ -21,7 +21,9 @@
# limitations under the License.
from __future__ import annotations
+import asyncio
import re
+from contextlib import nullcontext
from typing import Any, Optional
from fastapi import FastAPI
@@ -59,8 +61,11 @@ class LLMJudgeResourcesServerConfig(BaseResourcesServerConfig):
judge_model_server: ModelServerRef
judge_responses_create_params: NeMoGymResponseCreateParamsNonStreaming
+ # Concurrency limit for judge endpoint requests. Set to None to disable limiting.
+ judge_endpoint_max_concurrency: Optional[int] = 64
+
judge_system_message: Optional[str] = None
- judge_prompt_template: str
+ judge_prompt_template_fpath: str = "prompt_templates/equivalence_llm_judge.txt"
judge_equal_label: str = "[[A=B]]"
judge_not_equal_label: str = "[[A!=B]]"
# Optional regex to extract the question from the last user message.
@@ -250,6 +255,17 @@ class LLMJudgeResourcesServer(SimpleResourcesServer):
config: LLMJudgeResourcesServerConfig
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ if self.config.judge_endpoint_max_concurrency is not None:
+ self._judge_endpoint_max_concurrency = asyncio.Semaphore(value=self.config.judge_endpoint_max_concurrency)
+ else:
+ self._judge_endpoint_max_concurrency = nullcontext()
+
+ with open(self.config.judge_prompt_template_fpath, "r") as f:
+ self._judge_prompt_template = f.read().strip()
+
def setup_webserver(self) -> FastAPI:
app = super().setup_webserver()
return app
@@ -420,7 +436,7 @@ async def _generate_judge_evaluation(
not_equal_label = cfg.judge_not_equal_label
responses_create_params = cfg.judge_responses_create_params.model_copy(deep=True)
- prompt_template = cfg.judge_prompt_template
+ prompt_template = self._judge_prompt_template
system_message = cfg.judge_system_message
user_prompt = prompt_template.format(
@@ -433,12 +449,21 @@ async def _generate_judge_evaluation(
msgs.append(NeMoGymEasyInputMessage(role="user", content=user_prompt))
responses_create_params.input = msgs
- response = await self.server_client.post(
- server_name=cfg.judge_model_server.name,
- url_path="/v1/responses",
- json=responses_create_params,
- )
- judge_response = NeMoGymResponse.model_validate(await get_response_json(response))
+ async with self._judge_endpoint_max_concurrency:
+ try:
+ response = await self.server_client.post(
+ server_name=cfg.judge_model_server.name,
+ url_path="/v1/responses",
+ json=responses_create_params,
+ )
+ judge_response = NeMoGymResponse.model_validate(await get_response_json(response))
+ except Exception as e:
+ print(
+ f"DEBUG: LLMJudgeResourcesServer: judge model server HTTP POST error: {type(e).__name__} {e}",
+ flush=True,
+ )
+ raise e
+
eval_record = JudgeEvaluation(
responses_create_params=responses_create_params,
response=judge_response,
diff --git a/resources_servers/equivalence_llm_judge/configs/equivalence_llm_judge.yaml b/resources_servers/equivalence_llm_judge/configs/equivalence_llm_judge.yaml
index 55bfb96d2..d2275fe85 100644
--- a/resources_servers/equivalence_llm_judge/configs/equivalence_llm_judge.yaml
+++ b/resources_servers/equivalence_llm_judge/configs/equivalence_llm_judge.yaml
@@ -7,64 +7,9 @@ equivalence_llm_judge:
name: policy_model
judge_responses_create_params:
input: []
- judge_prompt_template: |-
- ===== System role =====
- You are a meticulous STEM grader. Compare a candidate answer to a GOLD reference for a scientific question and decide strict equivalence.
-
- Grading priorities (in order):
- 1) Factual equivalence to GOLD (accept algebraically/formally equivalent formulations).
- 2) Completeness on required parts — the candidate must include the same core parts/subclaims as the GOLD.
-
- Rules:
- - Treat GOLD as authoritative for what counts as correct.
- - If GOLD is a range or set, the candidate is equivalent only if it lies within that range or is a member of that set.
- - For formulas/derivations, accept mathematically identical transformations (e.g., symbol reordering, factoring, equivalent identities).
- - Multi-part: all essential parts must match for “equivalent”; otherwise they are not equivalent.
- - Be concise. Do NOT reveal or rewrite the GOLD.
-
- Show your reason why they are equivalent or not equivalent first and then provide the output.
-
- Output (at the end after double newlines):
- - If equivalent: [[A=B]] they are equivalent
- - If not equivalent: [[A!=B]] they are not equivalent
-
- ===== Example 1 (equivalent) =====
- QUESTION:
- State Avogadro’s constant (include units).
-
- GOLD:
- 6.022 × 10^23 mol^-1
-
- CANDIDATE:
- 6.022e23 per mole.
-
- The candidate gives the same magnitude in scientific notation and the same “per mole” unit; no extra or missing qualifiers.
-
- [[A=B]] they are equivalent
-
- ===== Example 2 (not equivalent) =====
- QUESTION:
- State the first law of thermodynamics for a closed system and identify what each symbol represents.
-
- GOLD:
- ΔU = Q − W ; ΔU is change in internal energy, Q is heat added to the system, W is work done by the system.
-
- CANDIDATE:
- ΔU = Q + W ; ΔU is change in internal energy, Q is heat added to the system, W is work done by the system.
-
- The candidate uses the opposite sign convention for work relative to the required relationship; one core part is incorrect, so the overall statement does not match.
-
- [[A!=B]] they are not equivalent
-
- ===== Inputs =====
- QUESTION:
- {question}
-
- GOLD:
- {expected_answer}
-
- CANDIDATE:
- {generated_answer}
+ judge_prompt_template_fpath: prompt_templates/equivalence_llm_judge.txt
+ # Concurrency and rate limiting configuration
+ judge_endpoint_max_concurrency: 64 # Reduced from 64 to avoid rate limits
judge_system_message: null
judge_equal_label: "[[A=B]]"
judge_not_equal_label: "[[A!=B]]"
@@ -135,7 +80,7 @@ equivalence_llm_judge_simple_agent:
license: "TBD"
jsonl_fpath: resources_servers/equivalence_llm_judge/data/example_openqa.jsonl
- name: train
- type: train
+ type: example
license: Apache 2.0
jsonl_fpath: resources_servers/equivalence_llm_judge/data/train.jsonl
huggingface_identifier:
diff --git a/resources_servers/equivalence_llm_judge/configs/lc.yaml b/resources_servers/equivalence_llm_judge/configs/lc.yaml
new file mode 120000
index 000000000..b8218928a
--- /dev/null
+++ b/resources_servers/equivalence_llm_judge/configs/lc.yaml
@@ -0,0 +1 @@
+lc_judge.yaml
\ No newline at end of file
diff --git a/resources_servers/equivalence_llm_judge/configs/lc_judge.yaml b/resources_servers/equivalence_llm_judge/configs/lc_judge.yaml
new file mode 100644
index 000000000..db3843b50
--- /dev/null
+++ b/resources_servers/equivalence_llm_judge/configs/lc_judge.yaml
@@ -0,0 +1,35 @@
+lc_judge:
+ resources_servers:
+ equivalence_llm_judge:
+ entrypoint: app.py
+ judge_model_server:
+ type: responses_api_models
+ name: policy_model
+ judge_responses_create_params:
+ input: []
+ judge_prompt_template_fpath: prompt_templates/lc_judge.txt
+ judge_endpoint_max_concurrency: 64
+ judge_system_message: null
+ judge_equal_label: CORRECT
+ judge_not_equal_label: INCORRECT
+ check_twice_swap: false
+ reward_if_swap_fails: 0.0
+ question_extract_regex: ^QUESTION:\s*(.*)$
+ response_extract_regex: null
+ domain: knowledge
+ verified: false
+lc_judge_simple_agent:
+ responses_api_agents:
+ simple_agent:
+ entrypoint: app.py
+ resources_server:
+ type: resources_servers
+ name: lc_judge
+ model_server:
+ type: responses_api_models
+ name: policy_model
+ datasets:
+ - name: example
+ type: example
+ license: TBD
+ jsonl_fpath: resources_servers/equivalence_llm_judge/data/example.jsonl
diff --git a/resources_servers/equivalence_llm_judge/prompt_templates/equivalence_llm_judge.txt b/resources_servers/equivalence_llm_judge/prompt_templates/equivalence_llm_judge.txt
new file mode 100644
index 000000000..7a7de001b
--- /dev/null
+++ b/resources_servers/equivalence_llm_judge/prompt_templates/equivalence_llm_judge.txt
@@ -0,0 +1,58 @@
+
+===== System role =====
+You are a meticulous STEM grader. Compare a candidate answer to a GOLD reference for a scientific question and decide strict equivalence.
+
+Grading priorities (in order):
+1) Factual equivalence to GOLD (accept algebraically/formally equivalent formulations).
+2) Completeness on required parts — the candidate must include the same core parts/subclaims as the GOLD.
+
+Rules:
+- Treat GOLD as authoritative for what counts as correct.
+- If GOLD is a range or set, the candidate is equivalent only if it lies within that range or is a member of that set.
+- For formulas/derivations, accept mathematically identical transformations (e.g., symbol reordering, factoring, equivalent identities).
+- Multi-part: all essential parts must match for “equivalent”; otherwise they are not equivalent.
+- Be concise. Do NOT reveal or rewrite the GOLD.
+
+Show your reason why they are equivalent or not equivalent first and then provide the output.
+
+Output (at the end after double newlines):
+- If equivalent: [[A=B]] they are equivalent
+- If not equivalent: [[A!=B]] they are not equivalent
+
+===== Example 1 (equivalent) =====
+QUESTION:
+State Avogadro’s constant (include units).
+
+GOLD:
+6.022 × 10^23 mol^-1
+
+CANDIDATE:
+6.022e23 per mole.
+
+The candidate gives the same magnitude in scientific notation and the same “per mole” unit; no extra or missing qualifiers.
+
+[[A=B]] they are equivalent
+
+===== Example 2 (not equivalent) =====
+QUESTION:
+State the first law of thermodynamics for a closed system and identify what each symbol represents.
+
+GOLD:
+ΔU = Q − W ; ΔU is change in internal energy, Q is heat added to the system, W is work done by the system.
+
+CANDIDATE:
+ΔU = Q + W ; ΔU is change in internal energy, Q is heat added to the system, W is work done by the system.
+
+The candidate uses the opposite sign convention for work relative to the required relationship; one core part is incorrect, so the overall statement does not match.
+
+[[A!=B]] they are not equivalent
+
+===== Inputs =====
+QUESTION:
+{question}
+
+GOLD:
+{expected_answer}
+
+CANDIDATE:
+{generated_answer}
\ No newline at end of file
diff --git a/resources_servers/equivalence_llm_judge/prompt_templates/lc.txt b/resources_servers/equivalence_llm_judge/prompt_templates/lc.txt
new file mode 120000
index 000000000..b200b884c
--- /dev/null
+++ b/resources_servers/equivalence_llm_judge/prompt_templates/lc.txt
@@ -0,0 +1 @@
+lc_judge.txt
\ No newline at end of file
diff --git a/resources_servers/equivalence_llm_judge/prompt_templates/lc_judge.txt b/resources_servers/equivalence_llm_judge/prompt_templates/lc_judge.txt
new file mode 100644
index 000000000..bb44669d3
--- /dev/null
+++ b/resources_servers/equivalence_llm_judge/prompt_templates/lc_judge.txt
@@ -0,0 +1,9 @@
+
+Assess whether the following CANDIDATE ANSWER is CORRECT or INCORRECT.
+For the CANDIDATE ANSWER to be correct, it must be consistent with the OFFICIAL ANSWER.
+
+The question, for reference only: {question}
+The OFFICIAL ANSWER: {expected_answer}
+CANDIDATE ANSWER TO ASSESS: {generated_answer}
+
+Reply only with CORRECT or INCORRECT.
\ No newline at end of file
diff --git a/resources_servers/equivalence_llm_judge/tests/test_app.py b/resources_servers/equivalence_llm_judge/tests/test_app.py
index 83d875ab7..d824d2c9b 100644
--- a/resources_servers/equivalence_llm_judge/tests/test_app.py
+++ b/resources_servers/equivalence_llm_judge/tests/test_app.py
@@ -16,7 +16,6 @@
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock
-from omegaconf import OmegaConf
from pytest import approx, fixture
from nemo_gym.config_types import ModelServerRef
@@ -38,11 +37,8 @@
class TestApp:
@fixture
def config(self) -> LLMJudgeResourcesServerConfig:
- # Load judge template from YAML so tests mirror runtime config
- yaml_path = Path(__file__).resolve().parents[1] / "configs" / "equivalence_llm_judge.yaml"
- yaml_cfg = OmegaConf.load(str(yaml_path))
- judge_template: str = (
- yaml_cfg.equivalence_llm_judge.resources_servers.equivalence_llm_judge.judge_prompt_template
+ judge_prompt_template_fpath = str(
+ Path(__file__).resolve().parents[1] / "prompt_templates/equivalence_llm_judge.txt"
)
cfg = LLMJudgeResourcesServerConfig(
@@ -51,7 +47,7 @@ def config(self) -> LLMJudgeResourcesServerConfig:
entrypoint="",
judge_model_server=ModelServerRef(type="responses_api_models", name="judge"),
judge_responses_create_params=NeMoGymResponseCreateParamsNonStreaming(input=[]),
- judge_prompt_template=judge_template,
+ judge_prompt_template_fpath=judge_prompt_template_fpath,
)
cfg.judge_equal_label = "[[A=B]]"
cfg.judge_not_equal_label = "[[A!=B]]"
diff --git a/resources_servers/multichallenge/.gitignore b/resources_servers/multichallenge/.gitignore
new file mode 100644
index 000000000..ef5570e77
--- /dev/null
+++ b/resources_servers/multichallenge/.gitignore
@@ -0,0 +1,6 @@
+# Ignore data files (datasets should not be committed)
+data/advanced/
+data/vanilla/
+*.json
+*.jsonl
+!configs/*.yaml
diff --git a/resources_servers/multichallenge/README.md b/resources_servers/multichallenge/README.md
new file mode 100644
index 000000000..6e877d942
--- /dev/null
+++ b/resources_servers/multichallenge/README.md
@@ -0,0 +1,313 @@
+# MultiChallenge Environment
+
+Evaluates model responses on the **MultiChallenge** benchmark using an LLM judge. This benchmark assesses multi-turn conversation quality through rubric-based evaluation.
+
+## Quick Start
+
+```bash
+# 1. Run unit tests
+ng_test +entrypoint=resources_servers/multichallenge
+
+# 2. Start servers (in terminal 1)
+config_paths="resources_servers/multichallenge/configs/multichallenge.yaml,responses_api_models/vllm_model/configs/vllm_model.yaml"
+ng_run "+config_paths=[${config_paths}]"
+
+# 3. Collect rollouts on example data (in terminal 2)
+ng_collect_rollouts \
+ +agent_name=multichallenge_simple_agent \
+ +input_jsonl_fpath=resources_servers/multichallenge/data/example.jsonl \
+ +output_jsonl_fpath=/tmp/multichallenge_rollouts.jsonl
+```
+
+## Overview
+
+Each MultiChallenge task contains:
+- **Conversation context**: A multi-turn dialogue between user and assistant
+- **Rubric**: A set of yes/no questions evaluating the final response quality
+- **Metadata**: Task information including topic, challenge type, and persona
+
+The environment:
+1. Feeds the conversation context to the policy model
+2. Retrieves the final response (excluding thinking/reasoning blocks)
+3. Evaluates each rubric question using an LLM judge
+4. Aggregates scores using a configurable method (mean, min, all, etc.)
+
+## Data Preparation
+
+### Option A: Use Example Data Only (Quick Testing)
+
+The `data/example.jsonl` file contains 3 synthetic tasks ready to use:
+
+```bash
+# No preprocessing needed - just run
+ng_collect_rollouts \
+ +agent_name=multichallenge_simple_agent \
+ +input_jsonl_fpath=resources_servers/multichallenge/data/example.jsonl \
+ +output_jsonl_fpath=/tmp/test_rollouts.jsonl
+```
+
+### Option B: Full Dataset Setup
+
+> **Important**: Run the preprocessing script **before launching training jobs**.
+> The preprocessed JSONL files must exist in `data/` for the training pipeline to work.
+
+1. **Preprocess to JSONL format**:
+ ```bash
+ # Run from the multichallenge directory
+ cd resources_servers/multichallenge
+ python dataset_preprocess.py
+ ```
+
+ This reads from the raw data directory and outputs:
+ - `data/advanced.jsonl` (994 tasks)
+ - `data/vanilla.jsonl` (1023 tasks)
+
+ The script supports two input modes:
+ - `--mode jsonl` (default): Reads pre-compiled `{split}.jsonl` files
+ - `--mode json-dir`: Reads individual `{split}/*.json` files from directories
+
+ ```bash
+ # Custom input/output paths
+ python dataset_preprocess.py \
+ --data-dir /path/to/raw/data \
+ --output-dir ./data \
+ --splits advanced vanilla
+ ```
+
+2. **Run on full dataset**:
+ ```bash
+ ng_collect_rollouts \
+ +agent_name=multichallenge_simple_agent \
+ +input_jsonl_fpath=resources_servers/multichallenge/data/advanced.jsonl \
+ +output_jsonl_fpath=/tmp/advanced_rollouts.jsonl
+ ```
+
+## Testing
+
+### Unit Tests
+
+```bash
+# Run all unit tests
+ng_test +entrypoint=resources_servers/multichallenge
+
+# Or run directly with pytest for more detail
+cd resources_servers/multichallenge
+source .venv/bin/activate
+pytest -v
+```
+
+Tests cover:
+- Verdict extraction (`[[YES]]`/`[[NO]]`)
+- Context building (excluding thinking messages)
+- Score aggregation (mean, min, max, all, any, weighted)
+
+### End-to-End Sanity Test
+
+1. **Start servers**:
+ ```bash
+ config_paths="resources_servers/multichallenge/configs/multichallenge.yaml,responses_api_models/vllm_model/configs/vllm_model.yaml"
+ ng_run "+config_paths=[${config_paths}]"
+ ```
+
+2. **In another terminal, run on example data**:
+ ```bash
+ ng_collect_rollouts \
+ +agent_name=multichallenge_simple_agent \
+ +input_jsonl_fpath=resources_servers/multichallenge/data/example.jsonl \
+ +output_jsonl_fpath=/tmp/multichallenge_rollouts.jsonl \
+ +limit=3
+ ```
+
+3. **View results**:
+ ```bash
+ cat /tmp/multichallenge_rollouts.jsonl | python -c "
+ import json, sys
+ for line in sys.stdin:
+ d = json.loads(line)
+ print(f\"Reward: {d.get('reward')} | Passed: {d.get('num_passed')}/{d.get('num_total')}\")
+ "
+ ```
+
+## Configuration
+
+### Basic Setup
+
+```yaml
+multichallenge:
+ resources_servers:
+ multichallenge:
+ entrypoint: app.py
+
+ # Judge model configuration
+ judge_model_server:
+ type: responses_api_models
+ name: policy_model # or a dedicated judge model
+
+ # Judge request parameters
+ judge_responses_create_params:
+ input: []
+ max_output_tokens: 512
+ temperature: 0.0
+
+ # Score aggregation: mean | min | max | all | any | weighted
+ aggregation_mode: mean
+```
+
+### Aggregation Modes
+
+| Mode | Description |
+|------|-------------|
+| `mean` | Average of all rubric scores |
+| `min` | Minimum score (strictest) |
+| `max` | Maximum score (most lenient) |
+| `all` | All items must pass (binary: 0 or 1) |
+| `any` | Any item passes (binary: 0 or 1) |
+| `weighted` | Weighted average using rubric item weights |
+
+### Custom Judge Prompt
+
+You can customize the judge prompt template:
+
+```yaml
+judge_prompt_template: |-
+ You are evaluating whether a model's response meets a specific criterion.
+
+ CONVERSATION CONTEXT:
+ {context}
+
+ MODEL'S FINAL RESPONSE:
+ {response}
+
+ EVALUATION QUESTION:
+ {question}
+
+ EXPECTED ANSWER: {pass_criteria}
+
+ Respond with [[YES]] or [[NO]].
+```
+
+Placeholders:
+- `{context}`: Full conversation history
+- `{response}`: The model's final response
+- `{question}`: The rubric evaluation question
+- `{pass_criteria}`: Expected answer (usually "YES")
+
+## Data Format
+
+### Raw JSON Format (Input)
+
+Each task file contains:
+
+```json
+{
+ "metadata": {
+ "taskId": 12345,
+ "topic": "Education & Research",
+ "challenge": "Inference Memory"
+ },
+ "system": "Optional system prompt",
+ "messages": [
+ {"role": "user", "content": "..."},
+ {"role": "thinking", "content": "..."},
+ {"role": "assistant", "content": "..."}
+ ],
+ "rubric": [
+ {
+ "question": "Did the model correctly remember X?",
+ "pass_criteria": "YES"
+ }
+ ]
+}
+```
+
+### Preprocessed JSONL Format (Output)
+
+Each line contains:
+
+```json
+{
+ "uuid": "12345",
+ "task_id": 12345,
+ "responses_create_params": {
+ "input": [{"role": "user", "content": "..."}]
+ },
+ "rubric": [...],
+ "context": "[USER]: ...\n\n[ASSISTANT]: ...",
+ "metadata": {...}
+}
+```
+
+Key transformations:
+- `thinking` role messages are excluded from input
+- `context` is pre-formatted for the LLM judge
+- `responses_create_params` wraps input for `ng_collect_rollouts`
+
+## File Structure
+
+```
+multichallenge/
+├── app.py # Main server implementation
+├── dataset_preprocess.py # JSON → JSONL converter
+├── requirements.txt # Dependencies (-e nemo-gym[dev])
+├── README.md # This file
+├── .gitignore # Excludes data from git
+├── configs/
+│ └── multichallenge.yaml # Server + agent configuration
+├── data/
+│ ├── example.jsonl # Example data (3 tasks, committed)
+│ ├── advanced/ # Raw JSON files (ignored)
+│ ├── vanilla/ # Raw JSON files (ignored)
+│ ├── advanced.jsonl # Preprocessed (generated, ignored)
+│ ├── vanilla.jsonl # Preprocessed (generated, ignored)
+│ ├── .gitignore
+│ └── README.md
+└── tests/
+ ├── __init__.py
+ └── test_multichallenge.py
+```
+
+## API Endpoints
+
+- `POST /verify` - Evaluate a model response against the rubric
+- `POST /seed_session` - Initialize a new session
+
+### Verify Response
+
+```json
+{
+ "reward": 0.75,
+ "generated_response": "...",
+ "rubric_evaluations": [
+ {
+ "question": "...",
+ "pass_criteria": "YES",
+ "verdict": "YES",
+ "score": 1.0
+ }
+ ],
+ "num_passed": 3,
+ "num_total": 4,
+ "aggregation_mode": "mean"
+}
+```
+
+## Example Rubric Evaluation
+
+Given a conversation about travel planning where the user mentioned a seafood allergy:
+
+```json
+{
+ "question": "Did the model correctly remember that the user is allergic to seafood and avoid recommending seafood dishes?",
+ "pass_criteria": "YES"
+}
+```
+
+The LLM judge analyzes the model's response and returns `[[YES]]` or `[[NO]]`.
+
+---
+
+**Note**: The default raw data path is hardcoded in `dataset_preprocess.py`:
+```
+/lustre/fsw/portfolios/llmservice/users/mfathi/data/multichallenge
+```
+Update `DEFAULT_RAW_DATA_DIR` in the script or use `--data-dir` to specify a different location.
diff --git a/resources_servers/multichallenge/app.py b/resources_servers/multichallenge/app.py
new file mode 100644
index 000000000..cb039a1a4
--- /dev/null
+++ b/resources_servers/multichallenge/app.py
@@ -0,0 +1,382 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+MultiChallenge Environment Resources Server.
+
+Evaluates model responses on the MultiChallenge benchmark using an LLM judge.
+Each task contains a conversation context and a rubric of yes/no questions
+that assess the quality of the final assistant response.
+
+This environment:
+1. Loads tasks from configurable splits (e.g., "advanced", "vanilla")
+2. Feeds conversation context to the policy model
+3. Retrieves the final response (excluding thinking parts)
+4. Evaluates against each rubric entry using an LLM judge
+5. Aggregates scores using a configurable method
+"""
+
+from __future__ import annotations
+
+import re
+from enum import Enum
+from typing import Any, List, Optional
+
+from fastapi import FastAPI
+from pydantic import BaseModel, ConfigDict, Field
+
+from nemo_gym.base_resources_server import (
+ BaseResourcesServerConfig,
+ BaseRunRequest,
+ BaseVerifyRequest,
+ BaseVerifyResponse,
+ SimpleResourcesServer,
+)
+from nemo_gym.config_types import ModelServerRef
+from nemo_gym.openai_utils import (
+ NeMoGymEasyInputMessage,
+ NeMoGymResponse,
+ NeMoGymResponseCreateParamsNonStreaming,
+)
+
+
+class AggregationMode(str, Enum):
+ """How to aggregate rubric scores into a final reward."""
+
+ # Average of all rubric scores
+ MEAN = "mean"
+ # Minimum score across all rubric items (strict)
+ MIN = "min"
+ # Maximum score across all rubric items (lenient)
+ MAX = "max"
+ # All rubric items must pass (product of binary scores)
+ ALL = "all"
+ # Any rubric item passes (max of binary scores)
+ ANY = "any"
+ # Weighted average (requires weights in rubric items)
+ WEIGHTED = "weighted"
+
+
+class RubricEvaluation(BaseModel):
+ """Result of evaluating a single rubric item."""
+
+ question: str
+ pass_criteria: str
+ judge_prompt: str
+ judge_response: str
+ verdict: str # "YES" or "NO"
+ score: float # 1.0 for pass, 0.0 for fail
+ weight: float = 1.0
+
+
+class MultiChallengeConfig(BaseResourcesServerConfig):
+ """Configuration for the MultiChallenge environment server."""
+
+ name: str = "multichallenge"
+
+ # Reference to the judge model server
+ judge_model_server: ModelServerRef = Field(description="Reference to the model server used as the LLM judge")
+
+ # Parameters for judge requests
+ judge_responses_create_params: NeMoGymResponseCreateParamsNonStreaming = Field(
+ description="Base parameters for judge model requests"
+ )
+
+ # Aggregation mode for combining rubric scores
+ aggregation_mode: AggregationMode = Field(
+ default=AggregationMode.MEAN, description="How to aggregate scores from multiple rubric items"
+ )
+
+ # Template for the judge prompt
+ judge_prompt_template: str = Field(
+ default="""You are evaluating whether a model's response meets a specific criterion.
+
+CONVERSATION CONTEXT:
+{context}
+
+MODEL'S FINAL RESPONSE:
+{response}
+
+EVALUATION QUESTION:
+{question}
+
+EXPECTED ANSWER: {pass_criteria}
+
+Does the model's response satisfy the criterion described in the evaluation question?
+Analyze carefully, then respond with exactly [[YES]] or [[NO]] on the last line.""",
+ description="Template for the judge evaluation prompt",
+ )
+
+ # System message for the judge
+ judge_system_message: Optional[str] = Field(
+ default="You are a precise evaluator. Assess responses objectively based on the given criteria.",
+ description="Optional system message for the judge",
+ )
+
+ # Whether to run rubric evaluations in parallel
+ parallel_evaluation: bool = Field(default=True, description="Whether to evaluate rubric items in parallel")
+
+ # Labels for verdict extraction
+ yes_label: str = Field(default="[[YES]]", description="Label indicating YES verdict")
+ no_label: str = Field(default="[[NO]]", description="Label indicating NO verdict")
+
+
+class MultiChallengeRunRequest(BaseRunRequest):
+ """Run request payload for MultiChallenge tasks."""
+
+ model_config = ConfigDict(extra="allow")
+
+ uuid: Optional[str | int] = None
+ task_id: Optional[int] = None
+ rubric: Optional[List[dict]] = None
+ context: Optional[str] = None
+ metadata: Optional[dict[str, Any]] = None
+
+
+class MultiChallengeVerifyRequest(MultiChallengeRunRequest, BaseVerifyRequest):
+ """Verify request that includes the model's response."""
+
+ pass
+
+
+class MultiChallengeVerifyResponse(BaseVerifyResponse):
+ """Response with detailed rubric evaluations."""
+
+ model_config = ConfigDict(extra="allow")
+
+ context: str
+ generated_response: str
+ rubric_evaluations: List[RubricEvaluation]
+ aggregation_mode: str
+ num_passed: int
+ num_total: int
+
+
+def _extract_text_from_response(response: NeMoGymResponse, exclude_thinking: bool = True) -> str:
+ """Extract text content from the last assistant message, optionally excluding thinking."""
+ for output in reversed(response.output):
+ if getattr(output, "type", None) == "message" and getattr(output, "role", None) == "assistant":
+ content = getattr(output, "content", None)
+ if isinstance(content, list):
+ texts = []
+ for c in content:
+ text = getattr(c, "text", None)
+ if isinstance(text, str):
+ texts.append(text)
+ full_text = "\n".join(texts).strip()
+ elif isinstance(content, str):
+ full_text = content.strip()
+ else:
+ continue
+
+ if exclude_thinking:
+ # Remove ... blocks
+ full_text = re.sub(r".*?", "", full_text, flags=re.DOTALL)
+ # Also remove ... blocks
+ full_text = re.sub(r".*?", "", full_text, flags=re.DOTALL)
+
+ return full_text.strip()
+ return ""
+
+
+def _build_context_from_messages(messages: List[dict], exclude_thinking: bool = True) -> str:
+ """Build a readable context string from the message history."""
+ context_parts = []
+ for msg in messages:
+ role = msg.get("role", "unknown")
+ content = msg.get("content", "")
+
+ # Skip thinking messages
+ if exclude_thinking and role == "thinking":
+ continue
+
+ role_label = role.upper()
+ context_parts.append(f"[{role_label}]: {content}")
+
+ return "\n\n".join(context_parts)
+
+
def _extract_verdict(response_text: str, yes_label: str, no_label: str) -> str:
    """Map a judge reply to "YES" or "NO".

    Prefers whichever explicit label (e.g. "[[YES]]"/"[[NO]]") occurs last
    in the text. When neither label is present, falls back to scanning the
    final line for a plain YES/NO token, defaulting to "NO" if unclear.
    """
    last_yes = response_text.rfind(yes_label)
    last_no = response_text.rfind(no_label)

    if last_yes >= 0 or last_no >= 0:
        # The later occurrence wins (most authoritative).
        return "YES" if last_yes > last_no else "NO"

    # Fallback: plain YES/NO on the final line; unclear answers count as NO.
    lines = response_text.strip().split("\n")
    tail = lines[-1].strip().upper() if lines else ""
    if "YES" in tail:
        return "YES"
    if "NO" in tail:
        return "NO"
    return "NO"
+
+
class MultiChallengeServer(SimpleResourcesServer):
    """MultiChallenge evaluation server.

    Verifies a policy model's final response against a per-task rubric:
    each rubric item is posed to an LLM judge as a YES/NO question, and
    the per-item scores are aggregated into a single reward according to
    the configured aggregation mode.
    """

    # Typed config: judge model server, prompt template, verdict labels,
    # and aggregation settings (see MultiChallengeConfig).
    config: MultiChallengeConfig

    def setup_webserver(self) -> FastAPI:
        """Return the base resources-server app; no extra routes are added."""
        app = super().setup_webserver()
        return app

    async def verify(self, body: MultiChallengeVerifyRequest) -> MultiChallengeVerifyResponse:
        """Verify model response against the rubric using LLM judge.

        Returns a MultiChallengeVerifyResponse echoing the request payload
        plus the aggregate reward, per-item evaluations, and pass counts.
        """

        # Extract the generated response (without thinking)
        generated_response = _extract_text_from_response(body.response, exclude_thinking=True)

        # Get context from the request or build from messages if available
        context = body.context or ""
        if not context and body.metadata and "messages" in body.metadata:
            context = _build_context_from_messages(body.metadata["messages"])

        # Get rubric from request
        rubric = body.rubric or []
        if not rubric and body.metadata and "rubric" in body.metadata:
            rubric = body.metadata["rubric"]

        # Evaluate each rubric item
        if self.config.parallel_evaluation and len(rubric) > 1:
            # Imported lazily; gather returns results in rubric order.
            import asyncio

            evaluations = await asyncio.gather(
                *[self._evaluate_rubric_item(item, context, generated_response) for item in rubric]
            )
        else:
            evaluations = []
            for item in rubric:
                eval_result = await self._evaluate_rubric_item(item, context, generated_response)
                evaluations.append(eval_result)

        # Aggregate scores
        reward = self._aggregate_scores(evaluations)
        # >= 0.99 treats the nominally binary per-item score as "passed"
        # while staying robust to float representation noise.
        num_passed = sum(1 for e in evaluations if e.score >= 0.99)

        # Build response; pop the fields that are re-supplied explicitly
        # below to avoid duplicate keyword arguments.
        payload = body.model_dump()
        payload.pop("context", None)
        payload.pop("rubric", None)

        return MultiChallengeVerifyResponse(
            **payload,
            reward=reward,
            context=context,
            generated_response=generated_response,
            rubric_evaluations=evaluations,
            aggregation_mode=self.config.aggregation_mode.value,
            num_passed=num_passed,
            num_total=len(evaluations),
        )

    async def _evaluate_rubric_item(self, item: dict, context: str, response: str) -> RubricEvaluation:
        """Evaluate a single rubric item using the LLM judge.

        Formats the judge prompt from the configured template, calls the
        judge model's /v1/responses endpoint, extracts a YES/NO verdict
        from the reply, and scores it against the item's pass criteria.
        """

        question = item.get("question", "")
        pass_criteria = item.get("pass_criteria", "YES")
        weight = item.get("weight", 1.0)

        # Format the judge prompt
        judge_prompt = self.config.judge_prompt_template.format(
            context=context,
            response=response,
            question=question,
            pass_criteria=pass_criteria,
        )

        # Build messages for judge
        msgs: List[NeMoGymEasyInputMessage] = []
        if self.config.judge_system_message:
            msgs.append(NeMoGymEasyInputMessage(role="system", content=self.config.judge_system_message))
        msgs.append(NeMoGymEasyInputMessage(role="user", content=judge_prompt))

        # Create request parameters; deep copy so the shared config object
        # is never mutated by the per-item input assignment below.
        request_params = self.config.judge_responses_create_params.model_copy(deep=True)
        request_params.input = msgs

        # Call judge model
        response_obj = await self.server_client.post(
            server_name=self.config.judge_model_server.name,
            url_path="/v1/responses",
            json=request_params,
        )
        judge_response = NeMoGymResponse.model_validate(await response_obj.json())
        judge_text = _extract_text_from_response(judge_response, exclude_thinking=True)

        # Extract verdict
        verdict = _extract_verdict(judge_text, self.config.yes_label, self.config.no_label)

        # Score based on whether verdict matches expected criteria
        if pass_criteria.upper() == "YES":
            score = 1.0 if verdict == "YES" else 0.0
        elif pass_criteria.upper() == "NO":
            score = 1.0 if verdict == "NO" else 0.0
        else:
            # For other criteria, treat YES as success
            score = 1.0 if verdict == "YES" else 0.0

        return RubricEvaluation(
            question=question,
            pass_criteria=pass_criteria,
            judge_prompt=judge_prompt,
            judge_response=judge_text,
            verdict=verdict,
            score=score,
            weight=weight,
        )

    def _aggregate_scores(self, evaluations: List[RubricEvaluation]) -> float:
        """Aggregate rubric scores into final reward.

        Supported modes: MEAN, MIN, MAX, ALL (every item passed), ANY
        (at least one item passed), WEIGHTED (weight-normalized mean).
        Returns 0.0 for an empty evaluation list or unrecognized mode.
        """
        if not evaluations:
            return 0.0

        scores = [e.score for e in evaluations]
        weights = [e.weight for e in evaluations]

        mode = self.config.aggregation_mode

        if mode == AggregationMode.MEAN:
            return sum(scores) / len(scores)

        elif mode == AggregationMode.MIN:
            return min(scores)

        elif mode == AggregationMode.MAX:
            return max(scores)

        elif mode == AggregationMode.ALL:
            return 1.0 if all(s >= 0.99 for s in scores) else 0.0

        elif mode == AggregationMode.ANY:
            return 1.0 if any(s >= 0.99 for s in scores) else 0.0

        elif mode == AggregationMode.WEIGHTED:
            # Guard against a degenerate all-zero weight set.
            total_weight = sum(weights)
            if total_weight == 0:
                return 0.0
            weighted_sum = sum(s * w for s, w in zip(scores, weights))
            return weighted_sum / total_weight

        return 0.0
+
+
+if __name__ == "__main__":
+ MultiChallengeServer.run_webserver()
diff --git a/resources_servers/multichallenge/configs/multichallenge.yaml b/resources_servers/multichallenge/configs/multichallenge.yaml
new file mode 100644
index 000000000..7cb7ec74c
--- /dev/null
+++ b/resources_servers/multichallenge/configs/multichallenge.yaml
@@ -0,0 +1,72 @@
+multichallenge:
+ resources_servers:
+ multichallenge:
+ entrypoint: app.py
+ judge_model_server:
+ type: responses_api_models
+ name: policy_model
+ judge_responses_create_params:
+ input: []
+ max_output_tokens: 8192
+ temperature: 0.7
+ top_p: 0.8
+ aggregation_mode: mean
+ parallel_evaluation: true
+ judge_system_message: You are a precise evaluator. Assess responses objectively
+ based on the given criteria. Analyze the response carefully against the evaluation
+ question.
+ judge_prompt_template: 'You are evaluating whether a model''s response meets
+ a specific criterion.
+
+
+ CONVERSATION CONTEXT:
+
+ {context}
+
+
+ MODEL''S FINAL RESPONSE:
+
+ {response}
+
+
+ EVALUATION QUESTION:
+
+ {question}
+
+
+ EXPECTED ANSWER: {pass_criteria}
+
+
+ Does the model''s response satisfy the criterion described in the evaluation
+ question?
+
+ Think step by step, then respond with exactly [[YES]] or [[NO]] on the last
+ line.'
+ yes_label: '[[YES]]'
+ no_label: '[[NO]]'
+ domain: knowledge
+ description: MultiChallenge benchmark evaluation with LLM judge
+ verified: false
+multichallenge_simple_agent:
+ responses_api_agents:
+ simple_agent:
+ entrypoint: app.py
+ resources_server:
+ type: resources_servers
+ name: multichallenge
+ model_server:
+ type: responses_api_models
+ name: policy_model
+ datasets:
+ - name: multichallenge_example
+ type: example
+ license: Apache 2.0
+ jsonl_fpath: resources_servers/multichallenge/data/example.jsonl
+ - name: multichallenge_advanced
+ type: train
+ license: TBD
+ jsonl_fpath: resources_servers/multichallenge/data/advanced.jsonl
+ - name: multichallenge_vanilla
+ type: train
+ license: TBD
+ jsonl_fpath: resources_servers/multichallenge/data/vanilla.jsonl
diff --git a/resources_servers/multichallenge/configs/multichallenge_nrl.yaml b/resources_servers/multichallenge/configs/multichallenge_nrl.yaml
new file mode 100644
index 000000000..7b67aa0f0
--- /dev/null
+++ b/resources_servers/multichallenge/configs/multichallenge_nrl.yaml
@@ -0,0 +1,72 @@
+multichallenge:
+ resources_servers:
+ multichallenge:
+ entrypoint: app.py
+ judge_model_server:
+ type: responses_api_models
+ name: policy_model
+ judge_responses_create_params:
+ input: []
+ max_output_tokens: 512
+ temperature: 1.0
+ top_p: 1.0
+ aggregation_mode: mean
+ parallel_evaluation: true
+ judge_system_message: You are a precise evaluator. Assess responses objectively
+ based on the given criteria. Analyze the response carefully against the evaluation
+ question.
+ judge_prompt_template: 'You are evaluating whether a model''s response meets
+ a specific criterion.
+
+
+ CONVERSATION CONTEXT:
+
+ {context}
+
+
+ MODEL''S FINAL RESPONSE:
+
+ {response}
+
+
+ EVALUATION QUESTION:
+
+ {question}
+
+
+ EXPECTED ANSWER: {pass_criteria}
+
+
+ Does the model''s response satisfy the criterion described in the evaluation
+ question?
+
+ Think step by step, then respond with exactly [[YES]] or [[NO]] on the last
+ line.'
+ yes_label: '[[YES]]'
+ no_label: '[[NO]]'
+ domain: knowledge
+ description: MultiChallenge benchmark evaluation with LLM judge
+ verified: false
+multichallenge_simple_agent:
+ responses_api_agents:
+ simple_agent:
+ entrypoint: app.py
+ resources_server:
+ type: resources_servers
+ name: multichallenge
+ model_server:
+ type: responses_api_models
+ name: policy_model
+ datasets:
+ - name: multichallenge_example
+ type: example
+ license: Apache 2.0
+ jsonl_fpath: resources_servers/multichallenge/data/example.jsonl
+ - name: multichallenge_advanced
+ type: train
+ license: TBD
+ jsonl_fpath: resources_servers/multichallenge/data/advanced.jsonl
+ - name: multichallenge_vanilla
+ type: train
+ license: TBD
+ jsonl_fpath: resources_servers/multichallenge/data/vanilla.jsonl
diff --git a/resources_servers/multichallenge/data/.gitignore b/resources_servers/multichallenge/data/.gitignore
new file mode 100644
index 000000000..44c53a589
--- /dev/null
+++ b/resources_servers/multichallenge/data/.gitignore
@@ -0,0 +1,17 @@
+# Ignore all data files in this directory
+# The MultiChallenge dataset should be copied here but not committed to git
+
+# Ignore raw data directories
+advanced/
+vanilla/
+
+# Ignore preprocessed JSONL files (generated from JSON files)
+advanced.jsonl
+vanilla.jsonl
+
+# Keep these files
+!.gitignore
+!README.md
+!example.jsonl
+!example_rollouts.jsonl
+!example_metrics.json
diff --git a/resources_servers/multichallenge/data/README.md b/resources_servers/multichallenge/data/README.md
new file mode 100644
index 000000000..d8db78eb6
--- /dev/null
+++ b/resources_servers/multichallenge/data/README.md
@@ -0,0 +1,131 @@
+# MultiChallenge Data Directory
+
+This directory contains the MultiChallenge benchmark dataset.
+
+## Quick Start
+
+```bash
+# Option A: Use example data only (no setup needed)
+# The example.jsonl file is ready to use for testing
+
+# Option B: Full dataset setup
+# 1. Copy raw data
+cp -r /path/to/multichallenge/advanced ./advanced
+cp -r /path/to/multichallenge/vanilla ./vanilla
+
+# 2. Preprocess to JSONL (run from parent directory)
+cd ..
+python dataset_preprocess.py
+```
+
+## Directory Structure
+
+```
+data/
+├── example.jsonl # Example dataset (3 tasks, committed to git)
+├── advanced/ # Raw JSON task files (113 tasks, ignored)
+│ └── *.json
+├── vanilla/ # Raw JSON task files (111 tasks, ignored)
+│ └── *.json
+├── advanced.jsonl # Preprocessed dataset (generated, ignored)
+├── vanilla.jsonl # Preprocessed dataset (generated, ignored)
+├── .gitignore # Excludes data files from git
+└── README.md # This file
+```
+
+## Example Dataset
+
+The `example.jsonl` file contains 3 unique synthetic tasks for quick testing (the file has 5 lines in total — the third task is repeated to exercise duplicate-uuid handling):
+
+| # | Challenge | Rubric Items | Tests |
+|---|-----------|--------------|-------|
+| 1 | Memory Retention | 2 | Peanut allergy recall, name usage |
+| 2 | Preference Update | 3 | Pescatarian diet, preference correction |
+| 3 | Context Tracking | 2 | Presentation intro, climate change topic |
+
+**Usage:**
+```bash
+ng_collect_rollouts \
+ +agent_name=multichallenge_simple_agent \
+ +input_jsonl_fpath=resources_servers/multichallenge/data/example.jsonl \
+ +output_jsonl_fpath=/tmp/test_rollouts.jsonl
+```
+
+## Raw JSON Format
+
+Each task JSON file contains:
+
+```json
+{
+ "metadata": {
+ "taskId": 12345,
+ "topic": "Travel & Transportation",
+ "challenge": "Inference Memory",
+ "persona": "..."
+ },
+ "system": "Optional system prompt",
+ "messages": [
+ {"role": "user", "content": "..."},
+ {"role": "thinking", "content": "..."},
+ {"role": "assistant", "content": "..."},
+ {"role": "user", "content": "..."}
+ ],
+ "rubric": [
+ {
+ "question": "Did the model correctly remember X?",
+ "pass_criteria": "YES"
+ }
+ ],
+ "ground_truth_answer": "...",
+ "model_responses": [...] // Ignored by this environment
+}
+```
+
+## Preprocessed JSONL Format
+
+Each line in the JSONL file:
+
+```json
+{
+ "uuid": "12345",
+ "task_id": 12345,
+ "responses_create_params": {
+ "input": [
+ {"role": "system", "content": "..."},
+ {"role": "user", "content": "..."},
+ {"role": "assistant", "content": "..."},
+ {"role": "user", "content": "..."}
+ ]
+ },
+ "rubric": [...],
+ "context": "[USER]: ...\n\n[ASSISTANT]: ...",
+ "metadata": {...}
+}
+```
+
+**Key transformations:**
+- `thinking` role messages are **excluded** from `responses_create_params.input`
+- `context` is a pre-formatted string for the LLM judge (also excludes thinking)
+- `responses_create_params` wrapper is required by `ng_collect_rollouts`
+- `metadata` preserves full original data for reference
+
+## Regenerating JSONL Files
+
+If you modify the raw data or preprocessing logic:
+
+```bash
+python dataset_preprocess.py --data-dir ./data --splits advanced vanilla
+```
+
+**Options:**
+- `--data-dir`: Directory containing split subdirectories (default: `./data`)
+- `--output-dir`: Where to write JSONL files (default: same as data-dir)
+- `--splits`: Which splits to process (default: `advanced vanilla`)
+
+## Git Ignored Files
+
+The following are excluded from version control:
+- `advanced/` and `vanilla/` directories (raw data)
+- `advanced.jsonl` and `vanilla.jsonl` (preprocessed data)
+
+The `example.jsonl` file **is committed** for testing purposes.
diff --git a/resources_servers/multichallenge/data/example.jsonl b/resources_servers/multichallenge/data/example.jsonl
new file mode 100644
index 000000000..7593a7dcc
--- /dev/null
+++ b/resources_servers/multichallenge/data/example.jsonl
@@ -0,0 +1,5 @@
+{"uuid": "example_1", "task_id": 1, "agent_ref": {"type": "responses_api_agents", "name": "multichallenge_simple_agent"}, "responses_create_params": {"input": [{"role": "user", "content": "Hi! My name is Alex and I'm allergic to peanuts. Can you remember that?"}, {"role": "assistant", "content": "Of course, Alex! I've noted that you have a peanut allergy. I'll make sure to keep that in mind for any food-related suggestions. Is there anything else you'd like me to know about your dietary restrictions?"}, {"role": "user", "content": "No, that's all. Can you suggest a snack for me?"}]}, "rubric": [{"question": "Did the model correctly remember and avoid suggesting peanuts or peanut-containing snacks?", "pass_criteria": "YES"}, {"question": "Did the model address the user by their name (Alex)?", "pass_criteria": "YES"}], "context": "[USER]: Hi! My name is Alex and I'm allergic to peanuts. Can you remember that?\n\n[ASSISTANT]: Of course, Alex! I've noted that you have a peanut allergy. I'll make sure to keep that in mind for any food-related suggestions. Is there anything else you'd like me to know about your dietary restrictions?\n\n[USER]: No, that's all. Can you suggest a snack for me?", "metadata": {"topic": "Food & Dietary", "challenge": "Memory Retention"}}
+{"uuid": "example_2", "task_id": 2, "agent_ref": {"type": "responses_api_agents", "name": "multichallenge_simple_agent"}, "responses_create_params": {"input": [{"role": "system", "content": "You are a helpful travel assistant. Always be concise and practical."}, {"role": "user", "content": "I'm planning a trip to Japan next month. I don't eat meat."}, {"role": "assistant", "content": "Great choice! Japan has wonderful vegetarian options. I'll note that you're vegetarian for restaurant recommendations."}, {"role": "user", "content": "Actually, I eat fish, just not meat."}, {"role": "assistant", "content": "Understood! So you're pescatarian - you eat fish but not meat. That's perfect for Japan since seafood is a major part of the cuisine. I've updated my notes."}, {"role": "user", "content": "What are some must-try dishes for me in Tokyo?"}]}, "rubric": [{"question": "Did the model recommend dishes that include fish/seafood but exclude meat?", "pass_criteria": "YES"}, {"question": "Did the model correctly remember the user is pescatarian (not vegetarian)?", "pass_criteria": "YES"}, {"question": "Were the recommendations specific to Tokyo?", "pass_criteria": "YES"}], "context": "[SYSTEM]: You are a helpful travel assistant. Always be concise and practical.\n\n[USER]: I'm planning a trip to Japan next month. I don't eat meat.\n\n[ASSISTANT]: Great choice! Japan has wonderful vegetarian options. I'll note that you're vegetarian for restaurant recommendations.\n\n[USER]: Actually, I eat fish, just not meat.\n\n[ASSISTANT]: Understood! So you're pescatarian - you eat fish but not meat. That's perfect for Japan since seafood is a major part of the cuisine. I've updated my notes.\n\n[USER]: What are some must-try dishes for me in Tokyo?", "metadata": {"topic": "Travel", "challenge": "Preference Update"}}
+{"uuid": "example_3", "task_id": 3, "agent_ref": {"type": "responses_api_agents", "name": "multichallenge_simple_agent"}, "responses_create_params": {"input": [{"role": "user", "content": "I need help with my project. The deadline is Friday and today is Monday."}, {"role": "assistant", "content": "You have 4 days until your Friday deadline. Let's break this down - what's the project about and what have you completed so far?"}, {"role": "user", "content": "It's a presentation about climate change. I've done the research but haven't started the slides yet."}, {"role": "assistant", "content": "Good progress on research! For a 4-day timeline, I suggest: Day 1-2 (Mon-Tue): Create outline and draft slides. Day 3 (Wed): Add visuals and refine content. Day 4 (Thu): Practice and final polish. Would you like help with any specific section?"}, {"role": "user", "content": "Yes, can you help me structure the introduction?"}]}, "rubric": [{"question": "Did the model provide advice relevant to creating a presentation introduction?", "pass_criteria": "YES"}, {"question": "Did the model's response acknowledge the climate change topic?", "pass_criteria": "YES"}], "context": "[USER]: I need help with my project. The deadline is Friday and today is Monday.\n\n[ASSISTANT]: You have 4 days until your Friday deadline. Let's break this down - what's the project about and what have you completed so far?\n\n[USER]: It's a presentation about climate change. I've done the research but haven't started the slides yet.\n\n[ASSISTANT]: Good progress on research! For a 4-day timeline, I suggest: Day 1-2 (Mon-Tue): Create outline and draft slides. Day 3 (Wed): Add visuals and refine content. Day 4 (Thu): Practice and final polish. Would you like help with any specific section?\n\n[USER]: Yes, can you help me structure the introduction?", "metadata": {"topic": "Productivity", "challenge": "Context Tracking"}}
+{"uuid": "example_3", "task_id": 3, "agent_ref": {"type": "responses_api_agents", "name": "multichallenge_simple_agent"}, "responses_create_params": {"input": [{"role": "user", "content": "I need help with my project. The deadline is Friday and today is Monday."}, {"role": "assistant", "content": "You have 4 days until your Friday deadline. Let's break this down - what's the project about and what have you completed so far?"}, {"role": "user", "content": "It's a presentation about climate change. I've done the research but haven't started the slides yet."}, {"role": "assistant", "content": "Good progress on research! For a 4-day timeline, I suggest: Day 1-2 (Mon-Tue): Create outline and draft slides. Day 3 (Wed): Add visuals and refine content. Day 4 (Thu): Practice and final polish. Would you like help with any specific section?"}, {"role": "user", "content": "Yes, can you help me structure the introduction?"}]}, "rubric": [{"question": "Did the model provide advice relevant to creating a presentation introduction?", "pass_criteria": "YES"}, {"question": "Did the model's response acknowledge the climate change topic?", "pass_criteria": "YES"}], "context": "[USER]: I need help with my project. The deadline is Friday and today is Monday.\n\n[ASSISTANT]: You have 4 days until your Friday deadline. Let's break this down - what's the project about and what have you completed so far?\n\n[USER]: It's a presentation about climate change. I've done the research but haven't started the slides yet.\n\n[ASSISTANT]: Good progress on research! For a 4-day timeline, I suggest: Day 1-2 (Mon-Tue): Create outline and draft slides. Day 3 (Wed): Add visuals and refine content. Day 4 (Thu): Practice and final polish. Would you like help with any specific section?\n\n[USER]: Yes, can you help me structure the introduction?", "metadata": {"topic": "Productivity", "challenge": "Context Tracking"}}
+{"uuid": "example_3", "task_id": 3, "agent_ref": {"type": "responses_api_agents", "name": "multichallenge_simple_agent"}, "responses_create_params": {"input": [{"role": "user", "content": "I need help with my project. The deadline is Friday and today is Monday."}, {"role": "assistant", "content": "You have 4 days until your Friday deadline. Let's break this down - what's the project about and what have you completed so far?"}, {"role": "user", "content": "It's a presentation about climate change. I've done the research but haven't started the slides yet."}, {"role": "assistant", "content": "Good progress on research! For a 4-day timeline, I suggest: Day 1-2 (Mon-Tue): Create outline and draft slides. Day 3 (Wed): Add visuals and refine content. Day 4 (Thu): Practice and final polish. Would you like help with any specific section?"}, {"role": "user", "content": "Yes, can you help me structure the introduction?"}]}, "rubric": [{"question": "Did the model provide advice relevant to creating a presentation introduction?", "pass_criteria": "YES"}, {"question": "Did the model's response acknowledge the climate change topic?", "pass_criteria": "YES"}], "context": "[USER]: I need help with my project. The deadline is Friday and today is Monday.\n\n[ASSISTANT]: You have 4 days until your Friday deadline. Let's break this down - what's the project about and what have you completed so far?\n\n[USER]: It's a presentation about climate change. I've done the research but haven't started the slides yet.\n\n[ASSISTANT]: Good progress on research! For a 4-day timeline, I suggest: Day 1-2 (Mon-Tue): Create outline and draft slides. Day 3 (Wed): Add visuals and refine content. Day 4 (Thu): Practice and final polish. Would you like help with any specific section?\n\n[USER]: Yes, can you help me structure the introduction?", "metadata": {"topic": "Productivity", "challenge": "Context Tracking"}}
diff --git a/resources_servers/multichallenge/data/example_metrics.json b/resources_servers/multichallenge/data/example_metrics.json
new file mode 100644
index 000000000..8f60d2d9d
--- /dev/null
+++ b/resources_servers/multichallenge/data/example_metrics.json
@@ -0,0 +1,53 @@
+{
+ "name": "multichallenge_example",
+ "type": "example",
+ "jsonl_fpath": "resources_servers/multichallenge/data/example.jsonl",
+ "num_repeats": 1,
+ "gitlab_identifier": null,
+ "huggingface_identifier": null,
+ "license": "Apache 2.0",
+ "Number of examples": 5,
+ "Number of tools": {
+ "Total # non-null values": 0,
+ "Average": 0.0,
+ "Min": 0.0,
+ "Max": 0.0,
+ "Standard deviation": 0.0
+ },
+ "Json-dumped number of words (proxy for token count)": {
+ "Total # non-null values": 5,
+ "Average": 105.6,
+ "Min": 70.0,
+ "Max": 119.0,
+ "Standard deviation": 21.37
+ },
+ "Number of turns": {
+ "Total # non-null values": 5,
+ "Average": 2.8,
+ "Min": 2.0,
+ "Max": 3.0,
+ "Standard deviation": 0.447
+ },
+ "Temperature": {
+ "Total # non-null values": 0,
+ "Average": 0.0,
+ "Min": 0.0,
+ "Max": 0.0,
+ "Standard deviation": 0.0
+ },
+ "uuid": {
+ "unique_count": 3,
+ "total_count": 5
+ },
+ "task_id": {
+ "Total # non-null values": 5,
+ "Average": 2.4,
+ "Min": 1.0,
+ "Max": 3.0,
+ "Standard deviation": 0.894
+ },
+ "context": {
+ "unique_count": 3,
+ "total_count": 5
+ }
+}
\ No newline at end of file
diff --git a/resources_servers/multichallenge/data/example_rollouts.jsonl b/resources_servers/multichallenge/data/example_rollouts.jsonl
new file mode 100644
index 000000000..7593a7dcc
--- /dev/null
+++ b/resources_servers/multichallenge/data/example_rollouts.jsonl
@@ -0,0 +1,5 @@
+{"uuid": "example_1", "task_id": 1, "agent_ref": {"type": "responses_api_agents", "name": "multichallenge_simple_agent"}, "responses_create_params": {"input": [{"role": "user", "content": "Hi! My name is Alex and I'm allergic to peanuts. Can you remember that?"}, {"role": "assistant", "content": "Of course, Alex! I've noted that you have a peanut allergy. I'll make sure to keep that in mind for any food-related suggestions. Is there anything else you'd like me to know about your dietary restrictions?"}, {"role": "user", "content": "No, that's all. Can you suggest a snack for me?"}]}, "rubric": [{"question": "Did the model correctly remember and avoid suggesting peanuts or peanut-containing snacks?", "pass_criteria": "YES"}, {"question": "Did the model address the user by their name (Alex)?", "pass_criteria": "YES"}], "context": "[USER]: Hi! My name is Alex and I'm allergic to peanuts. Can you remember that?\n\n[ASSISTANT]: Of course, Alex! I've noted that you have a peanut allergy. I'll make sure to keep that in mind for any food-related suggestions. Is there anything else you'd like me to know about your dietary restrictions?\n\n[USER]: No, that's all. Can you suggest a snack for me?", "metadata": {"topic": "Food & Dietary", "challenge": "Memory Retention"}}
+{"uuid": "example_2", "task_id": 2, "agent_ref": {"type": "responses_api_agents", "name": "multichallenge_simple_agent"}, "responses_create_params": {"input": [{"role": "system", "content": "You are a helpful travel assistant. Always be concise and practical."}, {"role": "user", "content": "I'm planning a trip to Japan next month. I don't eat meat."}, {"role": "assistant", "content": "Great choice! Japan has wonderful vegetarian options. I'll note that you're vegetarian for restaurant recommendations."}, {"role": "user", "content": "Actually, I eat fish, just not meat."}, {"role": "assistant", "content": "Understood! So you're pescatarian - you eat fish but not meat. That's perfect for Japan since seafood is a major part of the cuisine. I've updated my notes."}, {"role": "user", "content": "What are some must-try dishes for me in Tokyo?"}]}, "rubric": [{"question": "Did the model recommend dishes that include fish/seafood but exclude meat?", "pass_criteria": "YES"}, {"question": "Did the model correctly remember the user is pescatarian (not vegetarian)?", "pass_criteria": "YES"}, {"question": "Were the recommendations specific to Tokyo?", "pass_criteria": "YES"}], "context": "[SYSTEM]: You are a helpful travel assistant. Always be concise and practical.\n\n[USER]: I'm planning a trip to Japan next month. I don't eat meat.\n\n[ASSISTANT]: Great choice! Japan has wonderful vegetarian options. I'll note that you're vegetarian for restaurant recommendations.\n\n[USER]: Actually, I eat fish, just not meat.\n\n[ASSISTANT]: Understood! So you're pescatarian - you eat fish but not meat. That's perfect for Japan since seafood is a major part of the cuisine. I've updated my notes.\n\n[USER]: What are some must-try dishes for me in Tokyo?", "metadata": {"topic": "Travel", "challenge": "Preference Update"}}
+{"uuid": "example_3", "task_id": 3, "agent_ref": {"type": "responses_api_agents", "name": "multichallenge_simple_agent"}, "responses_create_params": {"input": [{"role": "user", "content": "I need help with my project. The deadline is Friday and today is Monday."}, {"role": "assistant", "content": "You have 4 days until your Friday deadline. Let's break this down - what's the project about and what have you completed so far?"}, {"role": "user", "content": "It's a presentation about climate change. I've done the research but haven't started the slides yet."}, {"role": "assistant", "content": "Good progress on research! For a 4-day timeline, I suggest: Day 1-2 (Mon-Tue): Create outline and draft slides. Day 3 (Wed): Add visuals and refine content. Day 4 (Thu): Practice and final polish. Would you like help with any specific section?"}, {"role": "user", "content": "Yes, can you help me structure the introduction?"}]}, "rubric": [{"question": "Did the model provide advice relevant to creating a presentation introduction?", "pass_criteria": "YES"}, {"question": "Did the model's response acknowledge the climate change topic?", "pass_criteria": "YES"}], "context": "[USER]: I need help with my project. The deadline is Friday and today is Monday.\n\n[ASSISTANT]: You have 4 days until your Friday deadline. Let's break this down - what's the project about and what have you completed so far?\n\n[USER]: It's a presentation about climate change. I've done the research but haven't started the slides yet.\n\n[ASSISTANT]: Good progress on research! For a 4-day timeline, I suggest: Day 1-2 (Mon-Tue): Create outline and draft slides. Day 3 (Wed): Add visuals and refine content. Day 4 (Thu): Practice and final polish. Would you like help with any specific section?\n\n[USER]: Yes, can you help me structure the introduction?", "metadata": {"topic": "Productivity", "challenge": "Context Tracking"}}
+{"uuid": "example_3", "task_id": 3, "agent_ref": {"type": "responses_api_agents", "name": "multichallenge_simple_agent"}, "responses_create_params": {"input": [{"role": "user", "content": "I need help with my project. The deadline is Friday and today is Monday."}, {"role": "assistant", "content": "You have 4 days until your Friday deadline. Let's break this down - what's the project about and what have you completed so far?"}, {"role": "user", "content": "It's a presentation about climate change. I've done the research but haven't started the slides yet."}, {"role": "assistant", "content": "Good progress on research! For a 4-day timeline, I suggest: Day 1-2 (Mon-Tue): Create outline and draft slides. Day 3 (Wed): Add visuals and refine content. Day 4 (Thu): Practice and final polish. Would you like help with any specific section?"}, {"role": "user", "content": "Yes, can you help me structure the introduction?"}]}, "rubric": [{"question": "Did the model provide advice relevant to creating a presentation introduction?", "pass_criteria": "YES"}, {"question": "Did the model's response acknowledge the climate change topic?", "pass_criteria": "YES"}], "context": "[USER]: I need help with my project. The deadline is Friday and today is Monday.\n\n[ASSISTANT]: You have 4 days until your Friday deadline. Let's break this down - what's the project about and what have you completed so far?\n\n[USER]: It's a presentation about climate change. I've done the research but haven't started the slides yet.\n\n[ASSISTANT]: Good progress on research! For a 4-day timeline, I suggest: Day 1-2 (Mon-Tue): Create outline and draft slides. Day 3 (Wed): Add visuals and refine content. Day 4 (Thu): Practice and final polish. Would you like help with any specific section?\n\n[USER]: Yes, can you help me structure the introduction?", "metadata": {"topic": "Productivity", "challenge": "Context Tracking"}}
+{"uuid": "example_3", "task_id": 3, "agent_ref": {"type": "responses_api_agents", "name": "multichallenge_simple_agent"}, "responses_create_params": {"input": [{"role": "user", "content": "I need help with my project. The deadline is Friday and today is Monday."}, {"role": "assistant", "content": "You have 4 days until your Friday deadline. Let's break this down - what's the project about and what have you completed so far?"}, {"role": "user", "content": "It's a presentation about climate change. I've done the research but haven't started the slides yet."}, {"role": "assistant", "content": "Good progress on research! For a 4-day timeline, I suggest: Day 1-2 (Mon-Tue): Create outline and draft slides. Day 3 (Wed): Add visuals and refine content. Day 4 (Thu): Practice and final polish. Would you like help with any specific section?"}, {"role": "user", "content": "Yes, can you help me structure the introduction?"}]}, "rubric": [{"question": "Did the model provide advice relevant to creating a presentation introduction?", "pass_criteria": "YES"}, {"question": "Did the model's response acknowledge the climate change topic?", "pass_criteria": "YES"}], "context": "[USER]: I need help with my project. The deadline is Friday and today is Monday.\n\n[ASSISTANT]: You have 4 days until your Friday deadline. Let's break this down - what's the project about and what have you completed so far?\n\n[USER]: It's a presentation about climate change. I've done the research but haven't started the slides yet.\n\n[ASSISTANT]: Good progress on research! For a 4-day timeline, I suggest: Day 1-2 (Mon-Tue): Create outline and draft slides. Day 3 (Wed): Add visuals and refine content. Day 4 (Thu): Practice and final polish. Would you like help with any specific section?\n\n[USER]: Yes, can you help me structure the introduction?", "metadata": {"topic": "Productivity", "challenge": "Context Tracking"}}
diff --git a/resources_servers/multichallenge/dataset_preprocess.py b/resources_servers/multichallenge/dataset_preprocess.py
new file mode 100644
index 000000000..d653fb611
--- /dev/null
+++ b/resources_servers/multichallenge/dataset_preprocess.py
@@ -0,0 +1,246 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3  # NOTE: ineffective here — a shebang is only honored on the very first line of the file
+"""
+Preprocesses MultiChallenge dataset to the format required by NeMo-Gym.
+
+Supports two input modes:
+1. JSONL mode (default): Reads from pre-compiled .jsonl files
+   Input: {data-dir}/{split}.jsonl -> Output: {output-dir}/{split}.jsonl
+
+2. JSON directory mode: Reads from individual JSON files
+ Input: data/{split}/*.json -> Output: data/{split}.jsonl
+
+Each output line contains the task data formatted for the simple_agent.
+"""
+
+import argparse
+import json
+from pathlib import Path
+from typing import Any
+
+
+# Hardcoded path for raw multichallenge data
+DEFAULT_RAW_DATA_DIR = Path("/lustre/fsw/portfolios/llmservice/users/mfathi/data/multichallenge")
+
+
+def build_input_messages(task: dict) -> list[dict]:
+    """
+    Build the input messages for the policy model from the task data.
+
+    Prepends the system prompt (when present) and keeps every message except
+    those with the 'thinking' role. Note that the final user message IS
+    included: the downstream agent is expected to generate the assistant
+    reply to it.
+    """
+    messages = task.get("messages", [])
+    system_prompt = task.get("system", None)
+
+    input_msgs: list[dict] = []
+
+    # Add system message if present
+    if system_prompt:
+        input_msgs.append({"role": "system", "content": system_prompt})
+
+    # Add all messages (the agent will handle the conversation flow)
+    for msg in messages:
+        role = msg.get("role", "")
+        content = msg.get("content", "")
+
+        # Skip thinking messages - these shouldn't be sent to the policy model
+        if role == "thinking":
+            continue
+
+        input_msgs.append({"role": role, "content": content})
+
+    return input_msgs
+
+
+def build_context_string(task: dict) -> str:
+    """
+    Build a readable context string from messages for the judge.
+
+    Each non-'thinking' message is rendered as "[ROLE]: content" (system
+    prompt first, when present), with entries joined by blank lines.
+    """
+    messages = task.get("messages", [])
+    system_prompt = task.get("system", None)
+
+    context_parts: list[str] = []
+
+    if system_prompt:
+        context_parts.append(f"[SYSTEM]: {system_prompt}")
+
+    for msg in messages:
+        role = msg.get("role", "unknown")
+        content = msg.get("content", "")
+
+        # Skip thinking messages - internal reasoning is not shown to the judge
+        if role == "thinking":
+            continue
+
+        role_label = role.upper()
+        context_parts.append(f"[{role_label}]: {content}")
+
+    return "\n\n".join(context_parts)
+
+
+def process_task(task: dict, fallback_id: str = "unknown") -> dict[str, Any]:
+    """Process a single task dict into the preprocessed JSONL format.
+
+    Args:
+        task: Raw MultiChallenge task; may contain "messages", "system",
+            "rubric", "metadata", and "ground_truth_answer" keys.
+        fallback_id: Identifier used when task metadata has no "taskId".
+
+    Returns:
+        A JSON-serializable record with the agent reference, the policy
+        input messages, the evaluation rubric, a pre-rendered judge context
+        string, and the original metadata (including raw messages).
+    """
+    metadata = task.get("metadata", {})
+    task_id = metadata.get("taskId", fallback_id)
+
+    # Build the record for JSONL
+    record = {
+        "uuid": str(task_id),
+        "task_id": task_id,
+        # Agent reference - tells NeMo-Gym which agent to route this to
+        "agent_ref": {
+            "type": "responses_api_agents",
+            "name": "multichallenge_simple_agent",
+        },
+        # Input messages wrapped in responses_create_params (required by ng_collect_rollouts)
+        "responses_create_params": {
+            "input": build_input_messages(task),
+        },
+        # Rubric for evaluation
+        "rubric": task.get("rubric", []),
+        # Pre-built context string for the judge
+        "context": build_context_string(task),
+        # Full metadata (original messages/system kept for traceability)
+        "metadata": {
+            **metadata,
+            "messages": task.get("messages", []),
+            "system": task.get("system", None),
+            "ground_truth_answer": task.get("ground_truth_answer", None),
+        },
+    }
+
+    return record
+
+
+def process_task_file(filepath: Path) -> dict[str, Any]:
+    """Process a single task JSON file into JSONL format.
+
+    The file stem serves as the fallback task id when metadata has no taskId.
+    """
+    with open(filepath, "r", encoding="utf-8") as f:
+        task = json.load(f)
+    return process_task(task, fallback_id=filepath.stem)
+
+
+def process_jsonl_file(input_file: Path, output_file: Path) -> int:
+    """Process a JSONL file where each line is a task.
+
+    Blank lines are ignored; malformed JSON lines and per-line processing
+    errors are printed and skipped so one bad line never aborts the batch.
+
+    Returns:
+        The number of records successfully written to ``output_file``.
+    """
+    count = 0
+    errors = 0
+
+    print(f"Processing JSONL file: {input_file}")
+
+    with open(input_file, "r", encoding="utf-8") as in_f, open(output_file, "w", encoding="utf-8") as out_f:
+        for line_num, line in enumerate(in_f, start=1):
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                task = json.loads(line)
+                # line number doubles as the fallback id for tasks without a taskId
+                record = process_task(task, fallback_id=f"line_{line_num}")
+                out_f.write(json.dumps(record, ensure_ascii=False) + "\n")
+                count += 1
+            except json.JSONDecodeError as e:
+                print(f"  Warning: Invalid JSON on line {line_num}: {e}")
+                errors += 1
+            except Exception as e:
+                # broad catch is deliberate: keep converting the rest of the file
+                print(f"  Error processing line {line_num}: {e}")
+                errors += 1
+
+    print(f"  Wrote {count} records to {output_file}" + (f" ({errors} errors)" if errors else ""))
+    return count
+
+
+def process_split_jsonl(data_dir: Path, split: str, output_dir: Path) -> int:
+    """Process a split from a JSONL file.
+
+    Reads ``{data_dir}/{split}.jsonl`` and writes ``{output_dir}/{split}.jsonl``
+    (same filename, different directory). Returns the number of records
+    written, or 0 if the input file does not exist.
+    """
+    input_file = data_dir / f"{split}.jsonl"
+    if not input_file.exists():
+        print(f"Warning: JSONL file not found: {input_file}")
+        return 0
+
+    output_file = output_dir / f"{split}.jsonl"
+    return process_jsonl_file(input_file, output_file)
+
+
+def process_split_json_dir(data_dir: Path, split: str, output_dir: Path) -> int:
+    """Process all JSON files in a split directory.
+
+    Reads every ``{data_dir}/{split}/*.json`` (sorted by filename for a
+    deterministic output order) and writes one record per file to
+    ``{output_dir}/{split}.jsonl``. Files that fail to parse or convert are
+    reported and skipped. Returns the number of records written.
+    """
+    split_dir = data_dir / split
+    if not split_dir.exists():
+        print(f"Warning: Split directory not found: {split_dir}")
+        return 0
+
+    output_file = output_dir / f"{split}.jsonl"
+    count = 0
+
+    json_files = sorted(split_dir.glob("*.json"))
+    print(f"Processing {len(json_files)} files from {split}...")
+
+    with open(output_file, "w", encoding="utf-8") as out_f:
+        for filepath in json_files:
+            try:
+                record = process_task_file(filepath)
+                out_f.write(json.dumps(record, ensure_ascii=False) + "\n")
+                count += 1
+            except Exception as e:
+                # skip bad files but keep converting the rest of the split
+                print(f"Error processing {filepath}: {e}")
+
+    print(f"Wrote {count} records to {output_file}")
+    return count
+
+
+def main():
+    """CLI entry point: parse arguments and convert the requested splits.
+
+    Dispatches each split to either the JSONL reader or the per-file JSON
+    directory reader depending on --mode, and prints a total at the end.
+    """
+    parser = argparse.ArgumentParser(description="Convert MultiChallenge data to NeMo-Gym JSONL format")
+    parser.add_argument(
+        "--data-dir",
+        type=Path,
+        default=DEFAULT_RAW_DATA_DIR,
+        help=f"Directory containing the data (default: {DEFAULT_RAW_DATA_DIR})",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=Path,
+        default=Path(__file__).parent / "data",
+        help="Output directory for preprocessed JSONL files (default: ./data)",
+    )
+    parser.add_argument(
+        "--splits",
+        nargs="+",
+        default=["advanced", "vanilla"],
+        help="Splits to process (default: advanced vanilla)",
+    )
+    parser.add_argument(
+        "--mode",
+        choices=["jsonl", "json-dir"],
+        default="jsonl",
+        help="Input mode: 'jsonl' reads {split}.jsonl files, 'json-dir' reads {split}/*.json directories (default: jsonl)",
+    )
+    args = parser.parse_args()
+
+    args.output_dir.mkdir(parents=True, exist_ok=True)
+
+    print(f"Input directory: {args.data_dir}")
+    print(f"Output directory: {args.output_dir}")
+    print(f"Mode: {args.mode}")
+    print(f"Splits: {args.splits}")
+    print()
+
+    total = 0
+    for split in args.splits:
+        if args.mode == "jsonl":
+            total += process_split_jsonl(args.data_dir, split, args.output_dir)
+        else:
+            total += process_split_json_dir(args.data_dir, split, args.output_dir)
+
+    print(f"\nTotal: {total} records processed")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/resources_servers/multichallenge/requirements.txt b/resources_servers/multichallenge/requirements.txt
new file mode 100644
index 000000000..00ed83213
--- /dev/null
+++ b/resources_servers/multichallenge/requirements.txt
@@ -0,0 +1 @@
+-e nemo-gym[dev] @ ../../
diff --git a/resources_servers/multichallenge/tests/__init__.py b/resources_servers/multichallenge/tests/__init__.py
new file mode 100644
index 000000000..1a8431c3e
--- /dev/null
+++ b/resources_servers/multichallenge/tests/__init__.py
@@ -0,0 +1,2 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
diff --git a/resources_servers/multichallenge/tests/test_multichallenge.py b/resources_servers/multichallenge/tests/test_multichallenge.py
new file mode 100644
index 000000000..37bbba7d7
--- /dev/null
+++ b/resources_servers/multichallenge/tests/test_multichallenge.py
@@ -0,0 +1,136 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+from resources_servers.multichallenge.app import (
+ AggregationMode,
+ MultiChallengeConfig,
+ RubricEvaluation,
+ _build_context_from_messages,
+ _extract_verdict,
+)
+
+
+class TestMultiChallenge:
+    """Tests for MultiChallenge environment utilities.
+
+    Covers verdict extraction from judge responses (bracketed labels,
+    bare-token fallback, last-label-wins tie-breaking) and judge-context
+    construction with/without 'thinking' messages.
+    """
+
+    def test_extract_verdict_yes(self):
+        """Test extracting YES verdict."""
+        response = "After analysis, the model correctly addressed the user's allergy. [[YES]]"
+        verdict = _extract_verdict(response, "[[YES]]", "[[NO]]")
+        assert verdict == "YES"
+
+    def test_extract_verdict_no(self):
+        """Test extracting NO verdict."""
+        response = "The model failed to remember the allergy. [[NO]]"
+        verdict = _extract_verdict(response, "[[YES]]", "[[NO]]")
+        assert verdict == "NO"
+
+    def test_extract_verdict_fallback(self):
+        """Test fallback when no label present.
+
+        A bare trailing YES/NO token is accepted when the bracketed label is
+        absent (exact fallback matching rules live in _extract_verdict).
+        """
+        response = "The model did well.\nYES"
+        verdict = _extract_verdict(response, "[[YES]]", "[[NO]]")
+        assert verdict == "YES"
+
+    def test_extract_verdict_last_wins(self):
+        """Test that last label wins when both present."""
+        response = "Initially [[YES]] but actually [[NO]]"
+        verdict = _extract_verdict(response, "[[YES]]", "[[NO]]")
+        assert verdict == "NO"
+
+    def test_build_context_excludes_thinking(self):
+        """Test that thinking messages are excluded from context."""
+        messages = [
+            {"role": "user", "content": "Hello"},
+            {"role": "thinking", "content": "Processing..."},
+            {"role": "assistant", "content": "Hi there!"},
+        ]
+        context = _build_context_from_messages(messages, exclude_thinking=True)
+        assert "Processing" not in context
+        assert "[USER]: Hello" in context
+        assert "[ASSISTANT]: Hi there!" in context
+
+    def test_build_context_includes_thinking(self):
+        """Test that thinking messages can be included."""
+        messages = [
+            {"role": "user", "content": "Hello"},
+            {"role": "thinking", "content": "Processing..."},
+            {"role": "assistant", "content": "Hi there!"},
+        ]
+        context = _build_context_from_messages(messages, exclude_thinking=False)
+        assert "[THINKING]: Processing" in context
+
+
+class TestAggregation:
+    """Tests for score aggregation across rubric evaluations."""
+
+    def create_evaluations(self, scores: list[float]) -> list[RubricEvaluation]:
+        """Create mock evaluations with given scores.
+
+        A score >= 0.99 is labeled a passing "YES" verdict; all weights are 1.0.
+        """
+        return [
+            RubricEvaluation(
+                question=f"Q{i}",
+                pass_criteria="YES",
+                judge_prompt="...",
+                judge_response="...",
+                verdict="YES" if s >= 0.99 else "NO",
+                score=s,
+                weight=1.0,
+            )
+            for i, s in enumerate(scores)
+        ]
+
+    def test_aggregation_modes(self):
+        """Test various aggregation modes.
+
+        NOTE(review): this test mutates config.aggregation_mode after
+        constructing the server — presumably the server holds the config by
+        reference rather than copying it; confirm against MultiChallengeServer.
+        """
+        from unittest.mock import MagicMock
+
+        from nemo_gym.config_types import ModelServerRef
+        from nemo_gym.openai_utils import NeMoGymResponseCreateParamsNonStreaming
+        from nemo_gym.server_utils import ServerClient
+        from resources_servers.multichallenge.app import MultiChallengeServer
+
+        config = MultiChallengeConfig(
+            host="",
+            port=0,
+            entrypoint="",
+            name="test",
+            judge_model_server=ModelServerRef(type="responses_api_models", name="test"),
+            judge_responses_create_params=NeMoGymResponseCreateParamsNonStreaming(input=[]),
+        )
+
+        # Create a proper mock that passes pydantic validation
+        mock_client = MagicMock(spec=ServerClient)
+        # model_construct skips validation so the mock client is accepted
+        server = MultiChallengeServer.model_construct(config=config, server_client=mock_client)
+        evaluations = self.create_evaluations([1.0, 0.5, 0.0])
+
+        # Test MEAN
+        config.aggregation_mode = AggregationMode.MEAN
+        assert server._aggregate_scores(evaluations) == pytest.approx(0.5)
+
+        # Test MIN
+        config.aggregation_mode = AggregationMode.MIN
+        assert server._aggregate_scores(evaluations) == 0.0
+
+        # Test MAX
+        config.aggregation_mode = AggregationMode.MAX
+        assert server._aggregate_scores(evaluations) == 1.0
+
+        # Test ALL (only first passes)
+        config.aggregation_mode = AggregationMode.ALL
+        assert server._aggregate_scores(evaluations) == 0.0
+
+        # Test ANY (first passes)
+        config.aggregation_mode = AggregationMode.ANY
+        assert server._aggregate_scores(evaluations) == 1.0
diff --git a/resources_servers/swerl_gen/eval/singularity_utils.py b/resources_servers/swerl_gen/eval/singularity_utils.py
index 3a8ed8528..0a1252f32 100644
--- a/resources_servers/swerl_gen/eval/singularity_utils.py
+++ b/resources_servers/swerl_gen/eval/singularity_utils.py
@@ -200,7 +200,10 @@ def _run_instance(
# Using SPREAD scheduling so that Ray assigns tasks to as many distinct nodes as possible.
-@ray.remote(scheduling_strategy="SPREAD")
+@ray.remote(
+ scheduling_strategy="SPREAD",
+ runtime_env={"env_vars": {"PYTHONPATH": "/opt/nemo-rl/3rdparty/Gym-workspace/Gym"}},
+)
def compute_score(
extra_info_base64: str,
patch_str: str,