diff --git a/README.md b/README.md index eb29dc65d..162ca6b80 100644 --- a/README.md +++ b/README.md @@ -160,7 +160,6 @@ Purpose: Training-ready environments with curated datasets. | Mini Swe Agent | coding | SWE-Gym | A software development with mini-swe-agent orchestration | Improve software development capabilities, like SWE-bench | config | ✓ | ✓ | MIT | | Instruction Following | instruction_following | Nemotron-RL-instruction_following | Instruction following datasets targeting IFEval and IFBench style instruction following capabilities | Improve IFEval and IFBench | config | ✓ | - | Apache 2.0 | | Structured Outputs | instruction_following | Nemotron-RL-instruction_following-structured_outputs | Check if responses are following structured output requirements in prompts | Improve instruction following capabilities | config | ✓ | ✓ | Apache 2.0 | -| Equivalence Llm Judge | knowledge | Nemotron-RL-knowledge-openQA | Short answer questions with LLM-as-a-judge | Improve knowledge-related benchmarks like GPQA / HLE | config | ✓ | - | Apache 2.0 | | Mcqa | knowledge | Nemotron-RL-knowledge-mcqa | Multi-choice question answering problems | Improve benchmarks like MMLU / GPQA / HLE | config | ✓ | - | Apache 2.0 | | Math With Judge | math | Nemotron-RL-math-OpenMathReasoning | Math dataset with math-verify and LLM-as-a-judge | Improve math capabilities including AIME 24 / 25 | config | ✓ | ✓ | Creative Commons Attribution 4.0 International | | Math With Judge | math | Nemotron-RL-math-stack_overflow | - | - | config | ✓ | ✓ | Creative Commons Attribution-ShareAlike 4.0 International | diff --git a/resources_servers/calendar/app.py b/resources_servers/calendar/app.py index c844d7f4c..f0afeab64 100644 --- a/resources_servers/calendar/app.py +++ b/resources_servers/calendar/app.py @@ -47,7 +47,27 @@ def setup_webserver(self) -> FastAPI: return app async def verify(self, body: CalendarVerifyRequest) -> BaseVerifyResponse: - assistant_response = 
body.response.output[-1].content[0].text + # Extract the assistant's text response from the last output item. + # + # For reasoning models (e.g., with deepseek_r1 reasoning_parser), the output + # structure is: [ReasoningItem, MessageItem] where: + # - ReasoningItem: has .reasoning attribute (thinking/CoT tokens) + # - MessageItem: has .content attribute (actual response text) + # + # The last item should be a MessageItem with .content, but if the model + # hit the token limit while still thinking, the last item will be a + # ReasoningItem without .content. In that case, we return reward=0. + assistant_response = "" + if body.response.output: + last_output = body.response.output[-1] + if hasattr(last_output, "content") and last_output.content: + assistant_response = last_output.content[0].text + + # If no valid response (e.g., model only produced thinking tokens), + # return zero reward + if not assistant_response: + return BaseVerifyResponse(**body.model_dump(), reward=0) + exp_cal_state = body.exp_cal_state try: reward, reason = grade_assistant_response(assistant_response, exp_cal_state) diff --git a/resources_servers/equivalence_llm_judge/app.py b/resources_servers/equivalence_llm_judge/app.py index 3ff412310..a8da06f93 100644 --- a/resources_servers/equivalence_llm_judge/app.py +++ b/resources_servers/equivalence_llm_judge/app.py @@ -21,7 +21,9 @@ # limitations under the License. from __future__ import annotations +import asyncio import re +from contextlib import nullcontext from typing import Any, Optional from fastapi import FastAPI @@ -59,8 +61,11 @@ class LLMJudgeResourcesServerConfig(BaseResourcesServerConfig): judge_model_server: ModelServerRef judge_responses_create_params: NeMoGymResponseCreateParamsNonStreaming + # Concurrency limit for judge endpoint requests. Set to None to disable limiting. 
+ judge_endpoint_max_concurrency: Optional[int] = 64 + judge_system_message: Optional[str] = None - judge_prompt_template: str + judge_prompt_template_fpath: str = "prompt_templates/equivalence_llm_judge.txt" judge_equal_label: str = "[[A=B]]" judge_not_equal_label: str = "[[A!=B]]" # Optional regex to extract the question from the last user message. @@ -250,6 +255,17 @@ class LLMJudgeResourcesServer(SimpleResourcesServer): config: LLMJudgeResourcesServerConfig + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + if self.config.judge_endpoint_max_concurrency is not None: + self._judge_endpoint_max_concurrency = asyncio.Semaphore(value=self.config.judge_endpoint_max_concurrency) + else: + self._judge_endpoint_max_concurrency = nullcontext() + + with open(self.config.judge_prompt_template_fpath, "r") as f: + self._judge_prompt_template = f.read().strip() + def setup_webserver(self) -> FastAPI: app = super().setup_webserver() return app @@ -420,7 +436,7 @@ async def _generate_judge_evaluation( not_equal_label = cfg.judge_not_equal_label responses_create_params = cfg.judge_responses_create_params.model_copy(deep=True) - prompt_template = cfg.judge_prompt_template + prompt_template = self._judge_prompt_template system_message = cfg.judge_system_message user_prompt = prompt_template.format( @@ -433,12 +449,21 @@ async def _generate_judge_evaluation( msgs.append(NeMoGymEasyInputMessage(role="user", content=user_prompt)) responses_create_params.input = msgs - response = await self.server_client.post( - server_name=cfg.judge_model_server.name, - url_path="/v1/responses", - json=responses_create_params, - ) - judge_response = NeMoGymResponse.model_validate(await get_response_json(response)) + async with self._judge_endpoint_max_concurrency: + try: + response = await self.server_client.post( + server_name=cfg.judge_model_server.name, + url_path="/v1/responses", + json=responses_create_params, + ) + judge_response = 
NeMoGymResponse.model_validate(await get_response_json(response)) + except Exception as e: + print( + f"DEBUG: LLMJudgeResourcesServer: judge model server HTTP POST error: {type(e).__name__} {e}", + flush=True, + ) + raise + eval_record = JudgeEvaluation( responses_create_params=responses_create_params, response=judge_response, diff --git a/resources_servers/equivalence_llm_judge/configs/equivalence_llm_judge.yaml b/resources_servers/equivalence_llm_judge/configs/equivalence_llm_judge.yaml index 55bfb96d2..d2275fe85 100644 --- a/resources_servers/equivalence_llm_judge/configs/equivalence_llm_judge.yaml +++ b/resources_servers/equivalence_llm_judge/configs/equivalence_llm_judge.yaml @@ -7,64 +7,9 @@ equivalence_llm_judge: name: policy_model judge_responses_create_params: input: [] - judge_prompt_template: |- - ===== System role ===== - You are a meticulous STEM grader. Compare a candidate answer to a GOLD reference for a scientific question and decide strict equivalence. - - Grading priorities (in order): - 1) Factual equivalence to GOLD (accept algebraically/formally equivalent formulations). - 2) Completeness on required parts — the candidate must include the same core parts/subclaims as the GOLD. - - Rules: - - Treat GOLD as authoritative for what counts as correct. - - If GOLD is a range or set, the candidate is equivalent only if it lies within that range or is a member of that set. - - For formulas/derivations, accept mathematically identical transformations (e.g., symbol reordering, factoring, equivalent identities). - - Multi-part: all essential parts must match for “equivalent”; otherwise they are not equivalent. - - Be concise. Do NOT reveal or rewrite the GOLD. - - Show your reason why they are equivalent or not equivalent first and then provide the output.
- - Output (at the end after double newlines): - - If equivalent: [[A=B]] they are equivalent - - If not equivalent: [[A!=B]] they are not equivalent - - ===== Example 1 (equivalent) ===== - QUESTION: - State Avogadro’s constant (include units). - - GOLD: - 6.022 × 10^23 mol^-1 - - CANDIDATE: - 6.022e23 per mole. - - The candidate gives the same magnitude in scientific notation and the same “per mole” unit; no extra or missing qualifiers. - - [[A=B]] they are equivalent - - ===== Example 2 (not equivalent) ===== - QUESTION: - State the first law of thermodynamics for a closed system and identify what each symbol represents. - - GOLD: - ΔU = Q − W ; ΔU is change in internal energy, Q is heat added to the system, W is work done by the system. - - CANDIDATE: - ΔU = Q + W ; ΔU is change in internal energy, Q is heat added to the system, W is work done by the system. - - The candidate uses the opposite sign convention for work relative to the required relationship; one core part is incorrect, so the overall statement does not match. 
- - [[A!=B]] they are not equivalent - - ===== Inputs ===== - QUESTION: - {question} - - GOLD: - {expected_answer} - - CANDIDATE: - {generated_answer} + judge_prompt_template_fpath: prompt_templates/equivalence_llm_judge.txt + # Concurrency and rate limiting configuration + judge_endpoint_max_concurrency: 64 # Maximum number of concurrent judge requests (null disables limiting) judge_system_message: null judge_equal_label: "[[A=B]]" judge_not_equal_label: "[[A!=B]]" @@ -135,7 +80,7 @@ equivalence_llm_judge_simple_agent: license: "TBD" jsonl_fpath: resources_servers/equivalence_llm_judge/data/example_openqa.jsonl - name: train - type: train + type: example license: Apache 2.0 jsonl_fpath: resources_servers/equivalence_llm_judge/data/train.jsonl huggingface_identifier: diff --git a/resources_servers/equivalence_llm_judge/configs/lc.yaml b/resources_servers/equivalence_llm_judge/configs/lc.yaml new file mode 120000 index 000000000..b8218928a --- /dev/null +++ b/resources_servers/equivalence_llm_judge/configs/lc.yaml @@ -0,0 +1 @@ +lc_judge.yaml \ No newline at end of file diff --git a/resources_servers/equivalence_llm_judge/configs/lc_judge.yaml b/resources_servers/equivalence_llm_judge/configs/lc_judge.yaml new file mode 100644 index 000000000..db3843b50 --- /dev/null +++ b/resources_servers/equivalence_llm_judge/configs/lc_judge.yaml @@ -0,0 +1,35 @@ +lc_judge: + resources_servers: + equivalence_llm_judge: + entrypoint: app.py + judge_model_server: + type: responses_api_models + name: policy_model + judge_responses_create_params: + input: [] + judge_prompt_template_fpath: prompt_templates/lc_judge.txt + judge_endpoint_max_concurrency: 64 + judge_system_message: null + judge_equal_label: CORRECT + judge_not_equal_label: INCORRECT + check_twice_swap: false + reward_if_swap_fails: 0.0 + question_extract_regex: ^QUESTION:\s*(.*)$ + response_extract_regex: null + domain: knowledge + verified: false +lc_judge_simple_agent: + responses_api_agents: + simple_agent: + entrypoint: app.py + resources_server:
+ type: resources_servers + name: lc_judge + model_server: + type: responses_api_models + name: policy_model + datasets: + - name: example + type: example + license: TBD + jsonl_fpath: resources_servers/equivalence_llm_judge/data/example.jsonl diff --git a/resources_servers/equivalence_llm_judge/prompt_templates/equivalence_llm_judge.txt b/resources_servers/equivalence_llm_judge/prompt_templates/equivalence_llm_judge.txt new file mode 100644 index 000000000..7a7de001b --- /dev/null +++ b/resources_servers/equivalence_llm_judge/prompt_templates/equivalence_llm_judge.txt @@ -0,0 +1,58 @@ + +===== System role ===== +You are a meticulous STEM grader. Compare a candidate answer to a GOLD reference for a scientific question and decide strict equivalence. + +Grading priorities (in order): +1) Factual equivalence to GOLD (accept algebraically/formally equivalent formulations). +2) Completeness on required parts — the candidate must include the same core parts/subclaims as the GOLD. + +Rules: +- Treat GOLD as authoritative for what counts as correct. +- If GOLD is a range or set, the candidate is equivalent only if it lies within that range or is a member of that set. +- For formulas/derivations, accept mathematically identical transformations (e.g., symbol reordering, factoring, equivalent identities). +- Multi-part: all essential parts must match for “equivalent”; otherwise they are not equivalent. +- Be concise. Do NOT reveal or rewrite the GOLD. + +Show your reason why they are equivalent or not equivalent first and then provide the output. + +Output (at the end after double newlines): +- If equivalent: [[A=B]] they are equivalent +- If not equivalent: [[A!=B]] they are not equivalent + +===== Example 1 (equivalent) ===== +QUESTION: +State Avogadro’s constant (include units). + +GOLD: +6.022 × 10^23 mol^-1 + +CANDIDATE: +6.022e23 per mole. + +The candidate gives the same magnitude in scientific notation and the same “per mole” unit; no extra or missing qualifiers. 
+ +[[A=B]] they are equivalent + +===== Example 2 (not equivalent) ===== +QUESTION: +State the first law of thermodynamics for a closed system and identify what each symbol represents. + +GOLD: +ΔU = Q − W ; ΔU is change in internal energy, Q is heat added to the system, W is work done by the system. + +CANDIDATE: +ΔU = Q + W ; ΔU is change in internal energy, Q is heat added to the system, W is work done by the system. + +The candidate uses the opposite sign convention for work relative to the required relationship; one core part is incorrect, so the overall statement does not match. + +[[A!=B]] they are not equivalent + +===== Inputs ===== +QUESTION: +{question} + +GOLD: +{expected_answer} + +CANDIDATE: +{generated_answer} \ No newline at end of file diff --git a/resources_servers/equivalence_llm_judge/prompt_templates/lc.txt b/resources_servers/equivalence_llm_judge/prompt_templates/lc.txt new file mode 120000 index 000000000..b200b884c --- /dev/null +++ b/resources_servers/equivalence_llm_judge/prompt_templates/lc.txt @@ -0,0 +1 @@ +lc_judge.txt \ No newline at end of file diff --git a/resources_servers/equivalence_llm_judge/prompt_templates/lc_judge.txt b/resources_servers/equivalence_llm_judge/prompt_templates/lc_judge.txt new file mode 100644 index 000000000..bb44669d3 --- /dev/null +++ b/resources_servers/equivalence_llm_judge/prompt_templates/lc_judge.txt @@ -0,0 +1,9 @@ + +Assess whether the following CANDIDATE ANSWER is CORRECT or INCORRECT. +For the CANDIDATE ANSWER to be correct, it must be consistent with the OFFICIAL ANSWER. + +The question, for reference only: {question} +The OFFICIAL ANSWER: {expected_answer} +CANDIDATE ANSWER TO ASSESS: {generated_answer} + +Reply only with CORRECT or INCORRECT. 
\ No newline at end of file diff --git a/resources_servers/equivalence_llm_judge/tests/test_app.py b/resources_servers/equivalence_llm_judge/tests/test_app.py index 83d875ab7..d824d2c9b 100644 --- a/resources_servers/equivalence_llm_judge/tests/test_app.py +++ b/resources_servers/equivalence_llm_judge/tests/test_app.py @@ -16,7 +16,6 @@ from pathlib import Path from unittest.mock import AsyncMock, MagicMock -from omegaconf import OmegaConf from pytest import approx, fixture from nemo_gym.config_types import ModelServerRef @@ -38,11 +37,8 @@ class TestApp: @fixture def config(self) -> LLMJudgeResourcesServerConfig: - # Load judge template from YAML so tests mirror runtime config - yaml_path = Path(__file__).resolve().parents[1] / "configs" / "equivalence_llm_judge.yaml" - yaml_cfg = OmegaConf.load(str(yaml_path)) - judge_template: str = ( - yaml_cfg.equivalence_llm_judge.resources_servers.equivalence_llm_judge.judge_prompt_template + judge_prompt_template_fpath = str( + Path(__file__).resolve().parents[1] / "prompt_templates/equivalence_llm_judge.txt" ) cfg = LLMJudgeResourcesServerConfig( @@ -51,7 +47,7 @@ def config(self) -> LLMJudgeResourcesServerConfig: entrypoint="", judge_model_server=ModelServerRef(type="responses_api_models", name="judge"), judge_responses_create_params=NeMoGymResponseCreateParamsNonStreaming(input=[]), - judge_prompt_template=judge_template, + judge_prompt_template_fpath=judge_prompt_template_fpath, ) cfg.judge_equal_label = "[[A=B]]" cfg.judge_not_equal_label = "[[A!=B]]" diff --git a/resources_servers/multichallenge/.gitignore b/resources_servers/multichallenge/.gitignore new file mode 100644 index 000000000..ef5570e77 --- /dev/null +++ b/resources_servers/multichallenge/.gitignore @@ -0,0 +1,7 @@ +# Ignore data files (datasets should not be committed) +data/advanced/ +data/vanilla/ +*.json +*.jsonl +!configs/*.yaml +!data/example.jsonl diff --git a/resources_servers/multichallenge/README.md b/resources_servers/multichallenge/README.md new file mode 100644
index 000000000..6e877d942 --- /dev/null +++ b/resources_servers/multichallenge/README.md @@ -0,0 +1,313 @@ +# MultiChallenge Environment + +Evaluates model responses on the **MultiChallenge** benchmark using an LLM judge. This benchmark assesses multi-turn conversation quality through rubric-based evaluation. + +## Quick Start + +```bash +# 1. Run unit tests +ng_test +entrypoint=resources_servers/multichallenge + +# 2. Start servers (in terminal 1) +config_paths="resources_servers/multichallenge/configs/multichallenge.yaml,responses_api_models/vllm_model/configs/vllm_model.yaml" +ng_run "+config_paths=[${config_paths}]" + +# 3. Collect rollouts on example data (in terminal 2) +ng_collect_rollouts \ + +agent_name=multichallenge_simple_agent \ + +input_jsonl_fpath=resources_servers/multichallenge/data/example.jsonl \ + +output_jsonl_fpath=/tmp/multichallenge_rollouts.jsonl +``` + +## Overview + +Each MultiChallenge task contains: +- **Conversation context**: A multi-turn dialogue between user and assistant +- **Rubric**: A set of yes/no questions evaluating the final response quality +- **Metadata**: Task information including topic, challenge type, and persona + +The environment: +1. Feeds the conversation context to the policy model +2. Retrieves the final response (excluding thinking/reasoning blocks) +3. Evaluates each rubric question using an LLM judge +4. Aggregates scores using a configurable method (mean, min, all, etc.) + +## Data Preparation + +### Option A: Use Example Data Only (Quick Testing) + +The `data/example.jsonl` file contains 3 synthetic tasks ready to use: + +```bash +# No preprocessing needed - just run +ng_collect_rollouts \ + +agent_name=multichallenge_simple_agent \ + +input_jsonl_fpath=resources_servers/multichallenge/data/example.jsonl \ + +output_jsonl_fpath=/tmp/test_rollouts.jsonl +``` + +### Option B: Full Dataset Setup + +> **Important**: Run the preprocessing script **before launching training jobs**. 
+> The preprocessed JSONL files must exist in `data/` for the training pipeline to work. + +1. **Preprocess to JSONL format**: + ```bash + # Run from the multichallenge directory + cd resources_servers/multichallenge + python dataset_preprocess.py + ``` + + This reads from the raw data directory and outputs: + - `data/advanced.jsonl` (994 tasks) + - `data/vanilla.jsonl` (1023 tasks) + + The script supports two input modes: + - `--mode jsonl` (default): Reads pre-compiled `{split}.jsonl` files + - `--mode json-dir`: Reads individual `{split}/*.json` files from directories + + ```bash + # Custom input/output paths + python dataset_preprocess.py \ + --data-dir /path/to/raw/data \ + --output-dir ./data \ + --splits advanced vanilla + ``` + +2. **Run on full dataset**: + ```bash + ng_collect_rollouts \ + +agent_name=multichallenge_simple_agent \ + +input_jsonl_fpath=resources_servers/multichallenge/data/advanced.jsonl \ + +output_jsonl_fpath=/tmp/advanced_rollouts.jsonl + ``` + +## Testing + +### Unit Tests + +```bash +# Run all unit tests +ng_test +entrypoint=resources_servers/multichallenge + +# Or run directly with pytest for more detail +cd resources_servers/multichallenge +source .venv/bin/activate +pytest -v +``` + +Tests cover: +- Verdict extraction (`[[YES]]`/`[[NO]]`) +- Context building (excluding thinking messages) +- Score aggregation (mean, min, max, all, any, weighted) + +### End-to-End Sanity Test + +1. **Start servers**: + ```bash + config_paths="resources_servers/multichallenge/configs/multichallenge.yaml,responses_api_models/vllm_model/configs/vllm_model.yaml" + ng_run "+config_paths=[${config_paths}]" + ``` + +2. **In another terminal, run on example data**: + ```bash + ng_collect_rollouts \ + +agent_name=multichallenge_simple_agent \ + +input_jsonl_fpath=resources_servers/multichallenge/data/example.jsonl \ + +output_jsonl_fpath=/tmp/multichallenge_rollouts.jsonl \ + +limit=3 + ``` + +3. 
**View results**: + ```bash + cat /tmp/multichallenge_rollouts.jsonl | python -c " + import json, sys + for line in sys.stdin: + d = json.loads(line) + print(f\"Reward: {d.get('reward')} | Passed: {d.get('num_passed')}/{d.get('num_total')}\") + " + ``` + +## Configuration + +### Basic Setup + +```yaml +multichallenge: + resources_servers: + multichallenge: + entrypoint: app.py + + # Judge model configuration + judge_model_server: + type: responses_api_models + name: policy_model # or a dedicated judge model + + # Judge request parameters + judge_responses_create_params: + input: [] + max_output_tokens: 512 + temperature: 0.0 + + # Score aggregation: mean | min | max | all | any | weighted + aggregation_mode: mean +``` + +### Aggregation Modes + +| Mode | Description | +|------|-------------| +| `mean` | Average of all rubric scores | +| `min` | Minimum score (strictest) | +| `max` | Maximum score (most lenient) | +| `all` | All items must pass (binary: 0 or 1) | +| `any` | Any item passes (binary: 0 or 1) | +| `weighted` | Weighted average using rubric item weights | + +### Custom Judge Prompt + +You can customize the judge prompt template: + +```yaml +judge_prompt_template: |- + You are evaluating whether a model's response meets a specific criterion. + + CONVERSATION CONTEXT: + {context} + + MODEL'S FINAL RESPONSE: + {response} + + EVALUATION QUESTION: + {question} + + EXPECTED ANSWER: {pass_criteria} + + Respond with [[YES]] or [[NO]]. 
+``` + +Placeholders: +- `{context}`: Full conversation history +- `{response}`: The model's final response +- `{question}`: The rubric evaluation question +- `{pass_criteria}`: Expected answer (usually "YES") + +## Data Format + +### Raw JSON Format (Input) + +Each task file contains: + +```json +{ + "metadata": { + "taskId": 12345, + "topic": "Education & Research", + "challenge": "Inference Memory" + }, + "system": "Optional system prompt", + "messages": [ + {"role": "user", "content": "..."}, + {"role": "thinking", "content": "..."}, + {"role": "assistant", "content": "..."} + ], + "rubric": [ + { + "question": "Did the model correctly remember X?", + "pass_criteria": "YES" + } + ] +} +``` + +### Preprocessed JSONL Format (Output) + +Each line contains: + +```json +{ + "uuid": "12345", + "task_id": 12345, + "responses_create_params": { + "input": [{"role": "user", "content": "..."}] + }, + "rubric": [...], + "context": "[USER]: ...\n\n[ASSISTANT]: ...", + "metadata": {...} +} +``` + +Key transformations: +- `thinking` role messages are excluded from input +- `context` is pre-formatted for the LLM judge +- `responses_create_params` wraps input for `ng_collect_rollouts` + +## File Structure + +``` +multichallenge/ +├── app.py # Main server implementation +├── dataset_preprocess.py # JSON → JSONL converter +├── requirements.txt # Dependencies (-e nemo-gym[dev]) +├── README.md # This file +├── .gitignore # Excludes data from git +├── configs/ +│ └── multichallenge.yaml # Server + agent configuration +├── data/ +│ ├── example.jsonl # Example data (3 tasks, committed) +│ ├── advanced/ # Raw JSON files (ignored) +│ ├── vanilla/ # Raw JSON files (ignored) +│ ├── advanced.jsonl # Preprocessed (generated, ignored) +│ ├── vanilla.jsonl # Preprocessed (generated, ignored) +│ ├── .gitignore +│ └── README.md +└── tests/ + ├── __init__.py + └── test_multichallenge.py +``` + +## API Endpoints + +- `POST /verify` - Evaluate a model response against the rubric +- `POST 
/seed_session` - Initialize a new session + +### Verify Response + +```json +{ + "reward": 0.75, + "generated_response": "...", + "rubric_evaluations": [ + { + "question": "...", + "pass_criteria": "YES", + "verdict": "YES", + "score": 1.0 + } + ], + "num_passed": 3, + "num_total": 4, + "aggregation_mode": "mean" +} +``` + +## Example Rubric Evaluation + +Given a conversation about travel planning where the user mentioned a seafood allergy: + +```json +{ + "question": "Did the model correctly remember that the user is allergic to seafood and avoid recommending seafood dishes?", + "pass_criteria": "YES" +} +``` + +The LLM judge analyzes the model's response and returns `[[YES]]` or `[[NO]]`. + +--- + +**Note**: The default raw data path is hardcoded in `dataset_preprocess.py`: +``` +/lustre/fsw/portfolios/llmservice/users/mfathi/data/multichallenge +``` +Update `DEFAULT_RAW_DATA_DIR` in the script or use `--data-dir` to specify a different location. diff --git a/resources_servers/multichallenge/app.py b/resources_servers/multichallenge/app.py new file mode 100644 index 000000000..cb039a1a4 --- /dev/null +++ b/resources_servers/multichallenge/app.py @@ -0,0 +1,382 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +MultiChallenge Environment Resources Server. 
+ +Evaluates model responses on the MultiChallenge benchmark using an LLM judge. +Each task contains a conversation context and a rubric of yes/no questions +that assess the quality of the final assistant response. + +This environment: +1. Loads tasks from configurable splits (e.g., "advanced", "vanilla") +2. Feeds conversation context to the policy model +3. Retrieves the final response (excluding thinking parts) +4. Evaluates against each rubric entry using an LLM judge +5. Aggregates scores using a configurable method +""" + +from __future__ import annotations + +import re +from enum import Enum +from typing import Any, List, Optional + +from fastapi import FastAPI +from pydantic import BaseModel, ConfigDict, Field + +from nemo_gym.base_resources_server import ( + BaseResourcesServerConfig, + BaseRunRequest, + BaseVerifyRequest, + BaseVerifyResponse, + SimpleResourcesServer, +) +from nemo_gym.config_types import ModelServerRef +from nemo_gym.openai_utils import ( + NeMoGymEasyInputMessage, + NeMoGymResponse, + NeMoGymResponseCreateParamsNonStreaming, +) + + +class AggregationMode(str, Enum): + """How to aggregate rubric scores into a final reward.""" + + # Average of all rubric scores + MEAN = "mean" + # Minimum score across all rubric items (strict) + MIN = "min" + # Maximum score across all rubric items (lenient) + MAX = "max" + # All rubric items must pass (product of binary scores) + ALL = "all" + # Any rubric item passes (max of binary scores) + ANY = "any" + # Weighted average (requires weights in rubric items) + WEIGHTED = "weighted" + + +class RubricEvaluation(BaseModel): + """Result of evaluating a single rubric item.""" + + question: str + pass_criteria: str + judge_prompt: str + judge_response: str + verdict: str # "YES" or "NO" + score: float # 1.0 for pass, 0.0 for fail + weight: float = 1.0 + + +class MultiChallengeConfig(BaseResourcesServerConfig): + """Configuration for the MultiChallenge environment server.""" + + name: str = "multichallenge" + 
+ # Reference to the judge model server + judge_model_server: ModelServerRef = Field(description="Reference to the model server used as the LLM judge") + + # Parameters for judge requests + judge_responses_create_params: NeMoGymResponseCreateParamsNonStreaming = Field( + description="Base parameters for judge model requests" + ) + + # Aggregation mode for combining rubric scores + aggregation_mode: AggregationMode = Field( + default=AggregationMode.MEAN, description="How to aggregate scores from multiple rubric items" + ) + + # Template for the judge prompt + judge_prompt_template: str = Field( + default="""You are evaluating whether a model's response meets a specific criterion. + +CONVERSATION CONTEXT: +{context} + +MODEL'S FINAL RESPONSE: +{response} + +EVALUATION QUESTION: +{question} + +EXPECTED ANSWER: {pass_criteria} + +Does the model's response satisfy the criterion described in the evaluation question? +Analyze carefully, then respond with exactly [[YES]] or [[NO]] on the last line.""", + description="Template for the judge evaluation prompt", + ) + + # System message for the judge + judge_system_message: Optional[str] = Field( + default="You are a precise evaluator. 
Assess responses objectively based on the given criteria.", + description="Optional system message for the judge", + ) + + # Whether to run rubric evaluations in parallel + parallel_evaluation: bool = Field(default=True, description="Whether to evaluate rubric items in parallel") + + # Labels for verdict extraction + yes_label: str = Field(default="[[YES]]", description="Label indicating YES verdict") + no_label: str = Field(default="[[NO]]", description="Label indicating NO verdict") + + +class MultiChallengeRunRequest(BaseRunRequest): + """Run request payload for MultiChallenge tasks.""" + + model_config = ConfigDict(extra="allow") + + uuid: Optional[str | int] = None + task_id: Optional[int] = None + rubric: Optional[List[dict]] = None + context: Optional[str] = None + metadata: Optional[dict[str, Any]] = None + + +class MultiChallengeVerifyRequest(MultiChallengeRunRequest, BaseVerifyRequest): + """Verify request that includes the model's response.""" + + pass + + +class MultiChallengeVerifyResponse(BaseVerifyResponse): + """Response with detailed rubric evaluations.""" + + model_config = ConfigDict(extra="allow") + + context: str + generated_response: str + rubric_evaluations: List[RubricEvaluation] + aggregation_mode: str + num_passed: int + num_total: int + + +def _extract_text_from_response(response: NeMoGymResponse, exclude_thinking: bool = True) -> str: + """Extract text content from the last assistant message, optionally excluding thinking.""" + for output in reversed(response.output): + if getattr(output, "type", None) == "message" and getattr(output, "role", None) == "assistant": + content = getattr(output, "content", None) + if isinstance(content, list): + texts = [] + for c in content: + text = getattr(c, "text", None) + if isinstance(text, str): + texts.append(text) + full_text = "\n".join(texts).strip() + elif isinstance(content, str): + full_text = content.strip() + else: + continue + + if exclude_thinking: + # Remove ... 
blocks + full_text = re.sub(r".*?", "", full_text, flags=re.DOTALL) + # Also remove ... blocks + full_text = re.sub(r".*?", "", full_text, flags=re.DOTALL) + + return full_text.strip() + return "" + + +def _build_context_from_messages(messages: List[dict], exclude_thinking: bool = True) -> str: + """Build a readable context string from the message history.""" + context_parts = [] + for msg in messages: + role = msg.get("role", "unknown") + content = msg.get("content", "") + + # Skip thinking messages + if exclude_thinking and role == "thinking": + continue + + role_label = role.upper() + context_parts.append(f"[{role_label}]: {content}") + + return "\n\n".join(context_parts) + + +def _extract_verdict(response_text: str, yes_label: str, no_label: str) -> str: + """Extract YES/NO verdict from judge response.""" + # Look for the labels in the response + yes_pos = response_text.rfind(yes_label) + no_pos = response_text.rfind(no_label) + + if yes_pos < 0 and no_pos < 0: + # Fallback: look for plain YES/NO at end of response + lines = response_text.strip().split("\n") + last_line = lines[-1].strip().upper() if lines else "" + if "YES" in last_line: + return "YES" + elif "NO" in last_line: + return "NO" + return "NO" # Default to NO if unclear + + # Return whichever appears last (most authoritative) + if yes_pos > no_pos: + return "YES" + return "NO" + + +class MultiChallengeServer(SimpleResourcesServer): + """MultiChallenge evaluation server.""" + + config: MultiChallengeConfig + + def setup_webserver(self) -> FastAPI: + app = super().setup_webserver() + return app + + async def verify(self, body: MultiChallengeVerifyRequest) -> MultiChallengeVerifyResponse: + """Verify model response against the rubric using LLM judge.""" + + # Extract the generated response (without thinking) + generated_response = _extract_text_from_response(body.response, exclude_thinking=True) + + # Get context from the request or build from messages if available + context = body.context or "" + if 
not context and body.metadata and "messages" in body.metadata: + context = _build_context_from_messages(body.metadata["messages"]) + + # Get rubric from request + rubric = body.rubric or [] + if not rubric and body.metadata and "rubric" in body.metadata: + rubric = body.metadata["rubric"] + + # Evaluate each rubric item + if self.config.parallel_evaluation and len(rubric) > 1: + import asyncio + + evaluations = await asyncio.gather( + *[self._evaluate_rubric_item(item, context, generated_response) for item in rubric] + ) + else: + evaluations = [] + for item in rubric: + eval_result = await self._evaluate_rubric_item(item, context, generated_response) + evaluations.append(eval_result) + + # Aggregate scores + reward = self._aggregate_scores(evaluations) + num_passed = sum(1 for e in evaluations if e.score >= 0.99) + + # Build response + payload = body.model_dump() + payload.pop("context", None) + payload.pop("rubric", None) + + return MultiChallengeVerifyResponse( + **payload, + reward=reward, + context=context, + generated_response=generated_response, + rubric_evaluations=evaluations, + aggregation_mode=self.config.aggregation_mode.value, + num_passed=num_passed, + num_total=len(evaluations), + ) + + async def _evaluate_rubric_item(self, item: dict, context: str, response: str) -> RubricEvaluation: + """Evaluate a single rubric item using the LLM judge.""" + + question = item.get("question", "") + pass_criteria = item.get("pass_criteria", "YES") + weight = item.get("weight", 1.0) + + # Format the judge prompt + judge_prompt = self.config.judge_prompt_template.format( + context=context, + response=response, + question=question, + pass_criteria=pass_criteria, + ) + + # Build messages for judge + msgs: List[NeMoGymEasyInputMessage] = [] + if self.config.judge_system_message: + msgs.append(NeMoGymEasyInputMessage(role="system", content=self.config.judge_system_message)) + msgs.append(NeMoGymEasyInputMessage(role="user", content=judge_prompt)) + + # Create request 
parameters + request_params = self.config.judge_responses_create_params.model_copy(deep=True) + request_params.input = msgs + + # Call judge model + response_obj = await self.server_client.post( + server_name=self.config.judge_model_server.name, + url_path="/v1/responses", + json=request_params, + ) + judge_response = NeMoGymResponse.model_validate(await response_obj.json()) + judge_text = _extract_text_from_response(judge_response, exclude_thinking=True) + + # Extract verdict + verdict = _extract_verdict(judge_text, self.config.yes_label, self.config.no_label) + + # Score based on whether verdict matches expected criteria + if pass_criteria.upper() == "YES": + score = 1.0 if verdict == "YES" else 0.0 + elif pass_criteria.upper() == "NO": + score = 1.0 if verdict == "NO" else 0.0 + else: + # For other criteria, treat YES as success + score = 1.0 if verdict == "YES" else 0.0 + + return RubricEvaluation( + question=question, + pass_criteria=pass_criteria, + judge_prompt=judge_prompt, + judge_response=judge_text, + verdict=verdict, + score=score, + weight=weight, + ) + + def _aggregate_scores(self, evaluations: List[RubricEvaluation]) -> float: + """Aggregate rubric scores into final reward.""" + if not evaluations: + return 0.0 + + scores = [e.score for e in evaluations] + weights = [e.weight for e in evaluations] + + mode = self.config.aggregation_mode + + if mode == AggregationMode.MEAN: + return sum(scores) / len(scores) + + elif mode == AggregationMode.MIN: + return min(scores) + + elif mode == AggregationMode.MAX: + return max(scores) + + elif mode == AggregationMode.ALL: + return 1.0 if all(s >= 0.99 for s in scores) else 0.0 + + elif mode == AggregationMode.ANY: + return 1.0 if any(s >= 0.99 for s in scores) else 0.0 + + elif mode == AggregationMode.WEIGHTED: + total_weight = sum(weights) + if total_weight == 0: + return 0.0 + weighted_sum = sum(s * w for s, w in zip(scores, weights)) + return weighted_sum / total_weight + + return 0.0 + + +if __name__ == 
"__main__": + MultiChallengeServer.run_webserver() diff --git a/resources_servers/multichallenge/configs/multichallenge.yaml b/resources_servers/multichallenge/configs/multichallenge.yaml new file mode 100644 index 000000000..7cb7ec74c --- /dev/null +++ b/resources_servers/multichallenge/configs/multichallenge.yaml @@ -0,0 +1,72 @@ +multichallenge: + resources_servers: + multichallenge: + entrypoint: app.py + judge_model_server: + type: responses_api_models + name: policy_model + judge_responses_create_params: + input: [] + max_output_tokens: 8192 + temperature: 0.7 + top_p: 0.8 + aggregation_mode: mean + parallel_evaluation: true + judge_system_message: You are a precise evaluator. Assess responses objectively + based on the given criteria. Analyze the response carefully against the evaluation + question. + judge_prompt_template: 'You are evaluating whether a model''s response meets + a specific criterion. + + + CONVERSATION CONTEXT: + + {context} + + + MODEL''S FINAL RESPONSE: + + {response} + + + EVALUATION QUESTION: + + {question} + + + EXPECTED ANSWER: {pass_criteria} + + + Does the model''s response satisfy the criterion described in the evaluation + question? + + Think step by step, then respond with exactly [[YES]] or [[NO]] on the last + line.' 
+ yes_label: '[[YES]]' + no_label: '[[NO]]' + domain: knowledge + description: MultiChallenge benchmark evaluation with LLM judge + verified: false +multichallenge_simple_agent: + responses_api_agents: + simple_agent: + entrypoint: app.py + resources_server: + type: resources_servers + name: multichallenge + model_server: + type: responses_api_models + name: policy_model + datasets: + - name: multichallenge_example + type: example + license: Apache 2.0 + jsonl_fpath: resources_servers/multichallenge/data/example.jsonl + - name: multichallenge_advanced + type: train + license: TBD + jsonl_fpath: resources_servers/multichallenge/data/advanced.jsonl + - name: multichallenge_vanilla + type: train + license: TBD + jsonl_fpath: resources_servers/multichallenge/data/vanilla.jsonl diff --git a/resources_servers/multichallenge/configs/multichallenge_nrl.yaml b/resources_servers/multichallenge/configs/multichallenge_nrl.yaml new file mode 100644 index 000000000..7b67aa0f0 --- /dev/null +++ b/resources_servers/multichallenge/configs/multichallenge_nrl.yaml @@ -0,0 +1,72 @@ +multichallenge: + resources_servers: + multichallenge: + entrypoint: app.py + judge_model_server: + type: responses_api_models + name: policy_model + judge_responses_create_params: + input: [] + max_output_tokens: 512 + temperature: 1.0 + top_p: 1.0 + aggregation_mode: mean + parallel_evaluation: true + judge_system_message: You are a precise evaluator. Assess responses objectively + based on the given criteria. Analyze the response carefully against the evaluation + question. + judge_prompt_template: 'You are evaluating whether a model''s response meets + a specific criterion. + + + CONVERSATION CONTEXT: + + {context} + + + MODEL''S FINAL RESPONSE: + + {response} + + + EVALUATION QUESTION: + + {question} + + + EXPECTED ANSWER: {pass_criteria} + + + Does the model''s response satisfy the criterion described in the evaluation + question? 
+ + Think step by step, then respond with exactly [[YES]] or [[NO]] on the last + line.' + yes_label: '[[YES]]' + no_label: '[[NO]]' + domain: knowledge + description: MultiChallenge benchmark evaluation with LLM judge + verified: false +multichallenge_simple_agent: + responses_api_agents: + simple_agent: + entrypoint: app.py + resources_server: + type: resources_servers + name: multichallenge + model_server: + type: responses_api_models + name: policy_model + datasets: + - name: multichallenge_example + type: example + license: Apache 2.0 + jsonl_fpath: resources_servers/multichallenge/data/example.jsonl + - name: multichallenge_advanced + type: train + license: TBD + jsonl_fpath: resources_servers/multichallenge/data/advanced.jsonl + - name: multichallenge_vanilla + type: train + license: TBD + jsonl_fpath: resources_servers/multichallenge/data/vanilla.jsonl diff --git a/resources_servers/multichallenge/data/.gitignore b/resources_servers/multichallenge/data/.gitignore new file mode 100644 index 000000000..44c53a589 --- /dev/null +++ b/resources_servers/multichallenge/data/.gitignore @@ -0,0 +1,17 @@ +# Ignore all data files in this directory +# The MultiChallenge dataset should be copied here but not committed to git + +# Ignore raw data directories +advanced/ +vanilla/ + +# Ignore preprocessed JSONL files (generated from JSON files) +advanced.jsonl +vanilla.jsonl + +# Keep these files +!.gitignore +!README.md +!example.jsonl +!example_rollouts.jsonl +!example_metrics.json diff --git a/resources_servers/multichallenge/data/README.md b/resources_servers/multichallenge/data/README.md new file mode 100644 index 000000000..d8db78eb6 --- /dev/null +++ b/resources_servers/multichallenge/data/README.md @@ -0,0 +1,131 @@ +# MultiChallenge Data Directory + +This directory contains the MultiChallenge benchmark dataset. 
+ +## Quick Start + +```bash +# Option A: Use example data only (no setup needed) +# The example.jsonl file is ready to use for testing + +# Option B: Full dataset setup +# 1. Copy raw data +cp -r /path/to/multichallenge/advanced ./advanced +cp -r /path/to/multichallenge/vanilla ./vanilla + +# 2. Preprocess to JSONL (run from parent directory) +cd .. +python dataset_preprocess.py +``` + +## Directory Structure + +``` +data/ +├── example.jsonl # Example dataset (3 tasks, committed to git) +├── advanced/ # Raw JSON task files (113 tasks, ignored) +│ └── *.json +├── vanilla/ # Raw JSON task files (111 tasks, ignored) +│ └── *.json +├── advanced.jsonl # Preprocessed dataset (generated, ignored) +├── vanilla.jsonl # Preprocessed dataset (generated, ignored) +├── .gitignore # Excludes data files from git +└── README.md # This file +``` + +## Example Dataset + +The `example.jsonl` file contains 3 synthetic tasks for quick testing: + +| # | Challenge | Rubric Items | Tests | +|---|-----------|--------------|-------| +| 1 | Memory Retention | 2 | Peanut allergy recall, name usage | +| 2 | Preference Update | 3 | Pescatarian diet, preference correction | +| 3 | Context Tracking | 2 | Presentation intro, climate change topic | + +**Usage:** +```bash +ng_collect_rollouts \ + +agent_name=multichallenge_simple_agent \ + +input_jsonl_fpath=resources_servers/multichallenge/data/example.jsonl \ + +output_jsonl_fpath=/tmp/test_rollouts.jsonl +``` + +## Raw JSON Format + +Each task JSON file contains: + +```json +{ + "metadata": { + "taskId": 12345, + "topic": "Travel & Transportation", + "challenge": "Inference Memory", + "persona": "..." 
+ }, + "system": "Optional system prompt", + "messages": [ + {"role": "user", "content": "..."}, + {"role": "thinking", "content": "..."}, + {"role": "assistant", "content": "..."}, + {"role": "user", "content": "..."} + ], + "rubric": [ + { + "question": "Did the model correctly remember X?", + "pass_criteria": "YES" + } + ], + "ground_truth_answer": "...", + "model_responses": [...] // Ignored by this environment +} +``` + +## Preprocessed JSONL Format + +Each line in the JSONL file: + +```json +{ + "uuid": "12345", + "task_id": 12345, + "responses_create_params": { + "input": [ + {"role": "system", "content": "..."}, + {"role": "user", "content": "..."}, + {"role": "assistant", "content": "..."}, + {"role": "user", "content": "..."} + ] + }, + "rubric": [...], + "context": "[USER]: ...\n\n[ASSISTANT]: ...", + "metadata": {...} +} +``` + +**Key transformations:** +- `thinking` role messages are **excluded** from `responses_create_params.input` +- `context` is a pre-formatted string for the LLM judge (also excludes thinking) +- `responses_create_params` wrapper is required by `ng_collect_rollouts` +- `metadata` preserves full original data for reference + +## Regenerating JSONL Files + +If you modify the raw data or preprocessing logic: + +```bash +python dataset_preprocess.py --data-dir ./data --splits advanced vanilla +``` + +**Options:** +- `--data-dir`: Directory containing split subdirectories (default: `./data`) +- `--output-dir`: Where to write JSONL files (default: same as data-dir) +- `--splits`: Which splits to process (default: `advanced vanilla`) + +## Git Ignored Files + +The following are excluded from version control: +- `advanced/` and `vanilla/` directories (raw data) +- `advanced.jsonl` and `vanilla.jsonl` (preprocessed data) + +The `example.jsonl` file **is committed** for testing purposes. 
diff --git a/resources_servers/multichallenge/data/example.jsonl b/resources_servers/multichallenge/data/example.jsonl new file mode 100644 index 000000000..7593a7dcc --- /dev/null +++ b/resources_servers/multichallenge/data/example.jsonl @@ -0,0 +1,5 @@ +{"uuid": "example_1", "task_id": 1, "agent_ref": {"type": "responses_api_agents", "name": "multichallenge_simple_agent"}, "responses_create_params": {"input": [{"role": "user", "content": "Hi! My name is Alex and I'm allergic to peanuts. Can you remember that?"}, {"role": "assistant", "content": "Of course, Alex! I've noted that you have a peanut allergy. I'll make sure to keep that in mind for any food-related suggestions. Is there anything else you'd like me to know about your dietary restrictions?"}, {"role": "user", "content": "No, that's all. Can you suggest a snack for me?"}]}, "rubric": [{"question": "Did the model correctly remember and avoid suggesting peanuts or peanut-containing snacks?", "pass_criteria": "YES"}, {"question": "Did the model address the user by their name (Alex)?", "pass_criteria": "YES"}], "context": "[USER]: Hi! My name is Alex and I'm allergic to peanuts. Can you remember that?\n\n[ASSISTANT]: Of course, Alex! I've noted that you have a peanut allergy. I'll make sure to keep that in mind for any food-related suggestions. Is there anything else you'd like me to know about your dietary restrictions?\n\n[USER]: No, that's all. Can you suggest a snack for me?", "metadata": {"topic": "Food & Dietary", "challenge": "Memory Retention"}} +{"uuid": "example_2", "task_id": 2, "agent_ref": {"type": "responses_api_agents", "name": "multichallenge_simple_agent"}, "responses_create_params": {"input": [{"role": "system", "content": "You are a helpful travel assistant. Always be concise and practical."}, {"role": "user", "content": "I'm planning a trip to Japan next month. I don't eat meat."}, {"role": "assistant", "content": "Great choice! Japan has wonderful vegetarian options. 
I'll note that you're vegetarian for restaurant recommendations."}, {"role": "user", "content": "Actually, I eat fish, just not meat."}, {"role": "assistant", "content": "Understood! So you're pescatarian - you eat fish but not meat. That's perfect for Japan since seafood is a major part of the cuisine. I've updated my notes."}, {"role": "user", "content": "What are some must-try dishes for me in Tokyo?"}]}, "rubric": [{"question": "Did the model recommend dishes that include fish/seafood but exclude meat?", "pass_criteria": "YES"}, {"question": "Did the model correctly remember the user is pescatarian (not vegetarian)?", "pass_criteria": "YES"}, {"question": "Were the recommendations specific to Tokyo?", "pass_criteria": "YES"}], "context": "[SYSTEM]: You are a helpful travel assistant. Always be concise and practical.\n\n[USER]: I'm planning a trip to Japan next month. I don't eat meat.\n\n[ASSISTANT]: Great choice! Japan has wonderful vegetarian options. I'll note that you're vegetarian for restaurant recommendations.\n\n[USER]: Actually, I eat fish, just not meat.\n\n[ASSISTANT]: Understood! So you're pescatarian - you eat fish but not meat. That's perfect for Japan since seafood is a major part of the cuisine. I've updated my notes.\n\n[USER]: What are some must-try dishes for me in Tokyo?", "metadata": {"topic": "Travel", "challenge": "Preference Update"}} +{"uuid": "example_3", "task_id": 3, "agent_ref": {"type": "responses_api_agents", "name": "multichallenge_simple_agent"}, "responses_create_params": {"input": [{"role": "user", "content": "I need help with my project. The deadline is Friday and today is Monday."}, {"role": "assistant", "content": "You have 4 days until your Friday deadline. Let's break this down - what's the project about and what have you completed so far?"}, {"role": "user", "content": "It's a presentation about climate change. 
I've done the research but haven't started the slides yet."}, {"role": "assistant", "content": "Good progress on research! For a 4-day timeline, I suggest: Day 1-2 (Mon-Tue): Create outline and draft slides. Day 3 (Wed): Add visuals and refine content. Day 4 (Thu): Practice and final polish. Would you like help with any specific section?"}, {"role": "user", "content": "Yes, can you help me structure the introduction?"}]}, "rubric": [{"question": "Did the model provide advice relevant to creating a presentation introduction?", "pass_criteria": "YES"}, {"question": "Did the model's response acknowledge the climate change topic?", "pass_criteria": "YES"}], "context": "[USER]: I need help with my project. The deadline is Friday and today is Monday.\n\n[ASSISTANT]: You have 4 days until your Friday deadline. Let's break this down - what's the project about and what have you completed so far?\n\n[USER]: It's a presentation about climate change. I've done the research but haven't started the slides yet.\n\n[ASSISTANT]: Good progress on research! For a 4-day timeline, I suggest: Day 1-2 (Mon-Tue): Create outline and draft slides. Day 3 (Wed): Add visuals and refine content. Day 4 (Thu): Practice and final polish. Would you like help with any specific section?\n\n[USER]: Yes, can you help me structure the introduction?", "metadata": {"topic": "Productivity", "challenge": "Context Tracking"}} +{"uuid": "example_3", "task_id": 3, "agent_ref": {"type": "responses_api_agents", "name": "multichallenge_simple_agent"}, "responses_create_params": {"input": [{"role": "user", "content": "I need help with my project. The deadline is Friday and today is Monday."}, {"role": "assistant", "content": "You have 4 days until your Friday deadline. Let's break this down - what's the project about and what have you completed so far?"}, {"role": "user", "content": "It's a presentation about climate change. 
I've done the research but haven't started the slides yet."}, {"role": "assistant", "content": "Good progress on research! For a 4-day timeline, I suggest: Day 1-2 (Mon-Tue): Create outline and draft slides. Day 3 (Wed): Add visuals and refine content. Day 4 (Thu): Practice and final polish. Would you like help with any specific section?"}, {"role": "user", "content": "Yes, can you help me structure the introduction?"}]}, "rubric": [{"question": "Did the model provide advice relevant to creating a presentation introduction?", "pass_criteria": "YES"}, {"question": "Did the model's response acknowledge the climate change topic?", "pass_criteria": "YES"}], "context": "[USER]: I need help with my project. The deadline is Friday and today is Monday.\n\n[ASSISTANT]: You have 4 days until your Friday deadline. Let's break this down - what's the project about and what have you completed so far?\n\n[USER]: It's a presentation about climate change. I've done the research but haven't started the slides yet.\n\n[ASSISTANT]: Good progress on research! For a 4-day timeline, I suggest: Day 1-2 (Mon-Tue): Create outline and draft slides. Day 3 (Wed): Add visuals and refine content. Day 4 (Thu): Practice and final polish. Would you like help with any specific section?\n\n[USER]: Yes, can you help me structure the introduction?", "metadata": {"topic": "Productivity", "challenge": "Context Tracking"}} +{"uuid": "example_3", "task_id": 3, "agent_ref": {"type": "responses_api_agents", "name": "multichallenge_simple_agent"}, "responses_create_params": {"input": [{"role": "user", "content": "I need help with my project. The deadline is Friday and today is Monday."}, {"role": "assistant", "content": "You have 4 days until your Friday deadline. Let's break this down - what's the project about and what have you completed so far?"}, {"role": "user", "content": "It's a presentation about climate change. 
I've done the research but haven't started the slides yet."}, {"role": "assistant", "content": "Good progress on research! For a 4-day timeline, I suggest: Day 1-2 (Mon-Tue): Create outline and draft slides. Day 3 (Wed): Add visuals and refine content. Day 4 (Thu): Practice and final polish. Would you like help with any specific section?"}, {"role": "user", "content": "Yes, can you help me structure the introduction?"}]}, "rubric": [{"question": "Did the model provide advice relevant to creating a presentation introduction?", "pass_criteria": "YES"}, {"question": "Did the model's response acknowledge the climate change topic?", "pass_criteria": "YES"}], "context": "[USER]: I need help with my project. The deadline is Friday and today is Monday.\n\n[ASSISTANT]: You have 4 days until your Friday deadline. Let's break this down - what's the project about and what have you completed so far?\n\n[USER]: It's a presentation about climate change. I've done the research but haven't started the slides yet.\n\n[ASSISTANT]: Good progress on research! For a 4-day timeline, I suggest: Day 1-2 (Mon-Tue): Create outline and draft slides. Day 3 (Wed): Add visuals and refine content. Day 4 (Thu): Practice and final polish. 
Would you like help with any specific section?\n\n[USER]: Yes, can you help me structure the introduction?", "metadata": {"topic": "Productivity", "challenge": "Context Tracking"}} diff --git a/resources_servers/multichallenge/data/example_metrics.json b/resources_servers/multichallenge/data/example_metrics.json new file mode 100644 index 000000000..8f60d2d9d --- /dev/null +++ b/resources_servers/multichallenge/data/example_metrics.json @@ -0,0 +1,53 @@ +{ + "name": "multichallenge_example", + "type": "example", + "jsonl_fpath": "resources_servers/multichallenge/data/example.jsonl", + "num_repeats": 1, + "gitlab_identifier": null, + "huggingface_identifier": null, + "license": "Apache 2.0", + "Number of examples": 5, + "Number of tools": { + "Total # non-null values": 0, + "Average": 0.0, + "Min": 0.0, + "Max": 0.0, + "Standard deviation": 0.0 + }, + "Json-dumped number of words (proxy for token count)": { + "Total # non-null values": 5, + "Average": 105.6, + "Min": 70.0, + "Max": 119.0, + "Standard deviation": 21.37 + }, + "Number of turns": { + "Total # non-null values": 5, + "Average": 2.8, + "Min": 2.0, + "Max": 3.0, + "Standard deviation": 0.447 + }, + "Temperature": { + "Total # non-null values": 0, + "Average": 0.0, + "Min": 0.0, + "Max": 0.0, + "Standard deviation": 0.0 + }, + "uuid": { + "unique_count": 3, + "total_count": 5 + }, + "task_id": { + "Total # non-null values": 5, + "Average": 2.4, + "Min": 1.0, + "Max": 3.0, + "Standard deviation": 0.894 + }, + "context": { + "unique_count": 3, + "total_count": 5 + } +} \ No newline at end of file diff --git a/resources_servers/multichallenge/data/example_rollouts.jsonl b/resources_servers/multichallenge/data/example_rollouts.jsonl new file mode 100644 index 000000000..7593a7dcc --- /dev/null +++ b/resources_servers/multichallenge/data/example_rollouts.jsonl @@ -0,0 +1,5 @@ +{"uuid": "example_1", "task_id": 1, "agent_ref": {"type": "responses_api_agents", "name": "multichallenge_simple_agent"}, 
"responses_create_params": {"input": [{"role": "user", "content": "Hi! My name is Alex and I'm allergic to peanuts. Can you remember that?"}, {"role": "assistant", "content": "Of course, Alex! I've noted that you have a peanut allergy. I'll make sure to keep that in mind for any food-related suggestions. Is there anything else you'd like me to know about your dietary restrictions?"}, {"role": "user", "content": "No, that's all. Can you suggest a snack for me?"}]}, "rubric": [{"question": "Did the model correctly remember and avoid suggesting peanuts or peanut-containing snacks?", "pass_criteria": "YES"}, {"question": "Did the model address the user by their name (Alex)?", "pass_criteria": "YES"}], "context": "[USER]: Hi! My name is Alex and I'm allergic to peanuts. Can you remember that?\n\n[ASSISTANT]: Of course, Alex! I've noted that you have a peanut allergy. I'll make sure to keep that in mind for any food-related suggestions. Is there anything else you'd like me to know about your dietary restrictions?\n\n[USER]: No, that's all. Can you suggest a snack for me?", "metadata": {"topic": "Food & Dietary", "challenge": "Memory Retention"}} +{"uuid": "example_2", "task_id": 2, "agent_ref": {"type": "responses_api_agents", "name": "multichallenge_simple_agent"}, "responses_create_params": {"input": [{"role": "system", "content": "You are a helpful travel assistant. Always be concise and practical."}, {"role": "user", "content": "I'm planning a trip to Japan next month. I don't eat meat."}, {"role": "assistant", "content": "Great choice! Japan has wonderful vegetarian options. I'll note that you're vegetarian for restaurant recommendations."}, {"role": "user", "content": "Actually, I eat fish, just not meat."}, {"role": "assistant", "content": "Understood! So you're pescatarian - you eat fish but not meat. That's perfect for Japan since seafood is a major part of the cuisine. 
I've updated my notes."}, {"role": "user", "content": "What are some must-try dishes for me in Tokyo?"}]}, "rubric": [{"question": "Did the model recommend dishes that include fish/seafood but exclude meat?", "pass_criteria": "YES"}, {"question": "Did the model correctly remember the user is pescatarian (not vegetarian)?", "pass_criteria": "YES"}, {"question": "Were the recommendations specific to Tokyo?", "pass_criteria": "YES"}], "context": "[SYSTEM]: You are a helpful travel assistant. Always be concise and practical.\n\n[USER]: I'm planning a trip to Japan next month. I don't eat meat.\n\n[ASSISTANT]: Great choice! Japan has wonderful vegetarian options. I'll note that you're vegetarian for restaurant recommendations.\n\n[USER]: Actually, I eat fish, just not meat.\n\n[ASSISTANT]: Understood! So you're pescatarian - you eat fish but not meat. That's perfect for Japan since seafood is a major part of the cuisine. I've updated my notes.\n\n[USER]: What are some must-try dishes for me in Tokyo?", "metadata": {"topic": "Travel", "challenge": "Preference Update"}} +{"uuid": "example_3", "task_id": 3, "agent_ref": {"type": "responses_api_agents", "name": "multichallenge_simple_agent"}, "responses_create_params": {"input": [{"role": "user", "content": "I need help with my project. The deadline is Friday and today is Monday."}, {"role": "assistant", "content": "You have 4 days until your Friday deadline. Let's break this down - what's the project about and what have you completed so far?"}, {"role": "user", "content": "It's a presentation about climate change. I've done the research but haven't started the slides yet."}, {"role": "assistant", "content": "Good progress on research! For a 4-day timeline, I suggest: Day 1-2 (Mon-Tue): Create outline and draft slides. Day 3 (Wed): Add visuals and refine content. Day 4 (Thu): Practice and final polish. 
Would you like help with any specific section?"}, {"role": "user", "content": "Yes, can you help me structure the introduction?"}]}, "rubric": [{"question": "Did the model provide advice relevant to creating a presentation introduction?", "pass_criteria": "YES"}, {"question": "Did the model's response acknowledge the climate change topic?", "pass_criteria": "YES"}], "context": "[USER]: I need help with my project. The deadline is Friday and today is Monday.\n\n[ASSISTANT]: You have 4 days until your Friday deadline. Let's break this down - what's the project about and what have you completed so far?\n\n[USER]: It's a presentation about climate change. I've done the research but haven't started the slides yet.\n\n[ASSISTANT]: Good progress on research! For a 4-day timeline, I suggest: Day 1-2 (Mon-Tue): Create outline and draft slides. Day 3 (Wed): Add visuals and refine content. Day 4 (Thu): Practice and final polish. Would you like help with any specific section?\n\n[USER]: Yes, can you help me structure the introduction?", "metadata": {"topic": "Productivity", "challenge": "Context Tracking"}} +{"uuid": "example_3", "task_id": 3, "agent_ref": {"type": "responses_api_agents", "name": "multichallenge_simple_agent"}, "responses_create_params": {"input": [{"role": "user", "content": "I need help with my project. The deadline is Friday and today is Monday."}, {"role": "assistant", "content": "You have 4 days until your Friday deadline. Let's break this down - what's the project about and what have you completed so far?"}, {"role": "user", "content": "It's a presentation about climate change. I've done the research but haven't started the slides yet."}, {"role": "assistant", "content": "Good progress on research! For a 4-day timeline, I suggest: Day 1-2 (Mon-Tue): Create outline and draft slides. Day 3 (Wed): Add visuals and refine content. Day 4 (Thu): Practice and final polish. 
Would you like help with any specific section?"}, {"role": "user", "content": "Yes, can you help me structure the introduction?"}]}, "rubric": [{"question": "Did the model provide advice relevant to creating a presentation introduction?", "pass_criteria": "YES"}, {"question": "Did the model's response acknowledge the climate change topic?", "pass_criteria": "YES"}], "context": "[USER]: I need help with my project. The deadline is Friday and today is Monday.\n\n[ASSISTANT]: You have 4 days until your Friday deadline. Let's break this down - what's the project about and what have you completed so far?\n\n[USER]: It's a presentation about climate change. I've done the research but haven't started the slides yet.\n\n[ASSISTANT]: Good progress on research! For a 4-day timeline, I suggest: Day 1-2 (Mon-Tue): Create outline and draft slides. Day 3 (Wed): Add visuals and refine content. Day 4 (Thu): Practice and final polish. Would you like help with any specific section?\n\n[USER]: Yes, can you help me structure the introduction?", "metadata": {"topic": "Productivity", "challenge": "Context Tracking"}} +{"uuid": "example_3", "task_id": 3, "agent_ref": {"type": "responses_api_agents", "name": "multichallenge_simple_agent"}, "responses_create_params": {"input": [{"role": "user", "content": "I need help with my project. The deadline is Friday and today is Monday."}, {"role": "assistant", "content": "You have 4 days until your Friday deadline. Let's break this down - what's the project about and what have you completed so far?"}, {"role": "user", "content": "It's a presentation about climate change. I've done the research but haven't started the slides yet."}, {"role": "assistant", "content": "Good progress on research! For a 4-day timeline, I suggest: Day 1-2 (Mon-Tue): Create outline and draft slides. Day 3 (Wed): Add visuals and refine content. Day 4 (Thu): Practice and final polish. 
Would you like help with any specific section?"}, {"role": "user", "content": "Yes, can you help me structure the introduction?"}]}, "rubric": [{"question": "Did the model provide advice relevant to creating a presentation introduction?", "pass_criteria": "YES"}, {"question": "Did the model's response acknowledge the climate change topic?", "pass_criteria": "YES"}], "context": "[USER]: I need help with my project. The deadline is Friday and today is Monday.\n\n[ASSISTANT]: You have 4 days until your Friday deadline. Let's break this down - what's the project about and what have you completed so far?\n\n[USER]: It's a presentation about climate change. I've done the research but haven't started the slides yet.\n\n[ASSISTANT]: Good progress on research! For a 4-day timeline, I suggest: Day 1-2 (Mon-Tue): Create outline and draft slides. Day 3 (Wed): Add visuals and refine content. Day 4 (Thu): Practice and final polish. Would you like help with any specific section?\n\n[USER]: Yes, can you help me structure the introduction?", "metadata": {"topic": "Productivity", "challenge": "Context Tracking"}} diff --git a/resources_servers/multichallenge/dataset_preprocess.py b/resources_servers/multichallenge/dataset_preprocess.py new file mode 100644 index 000000000..d653fb611 --- /dev/null +++ b/resources_servers/multichallenge/dataset_preprocess.py @@ -0,0 +1,246 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Preprocesses MultiChallenge dataset to the format required by NeMo-Gym.

Supports two input modes:
1. JSONL mode (default): Reads from pre-compiled .jsonl files
   Input: data/{split}.jsonl -> Output: data/{split}_preprocessed.jsonl

2. JSON directory mode: Reads from individual JSON files
   Input: data/{split}/*.json -> Output: data/{split}.jsonl

Each output line contains the task data formatted for the simple_agent.
"""

import argparse
import json
from pathlib import Path
from typing import Any


# Hardcoded path for raw multichallenge data.
# NOTE(review): this is a user-specific cluster path; callers on other systems
# must pass --data-dir explicitly.
DEFAULT_RAW_DATA_DIR = Path("/lustre/fsw/portfolios/llmservice/users/mfathi/data/multichallenge")


def build_input_messages(task: dict) -> list[dict]:
    """
    Build the input messages for the policy model from the task data.

    Excludes 'thinking' role messages. All other messages — including the
    final user message the model should respond to — are kept in order;
    the agent handles the conversation flow.

    Args:
        task: Raw task dict with optional "system" prompt and a "messages" list.

    Returns:
        List of {"role", "content"} dicts, with the system prompt (if any) first.
    """
    messages = task.get("messages", [])
    system_prompt = task.get("system", None)

    input_msgs = []

    # Add system message if present
    if system_prompt:
        input_msgs.append({"role": "system", "content": system_prompt})

    for msg in messages:
        role = msg.get("role", "")
        content = msg.get("content", "")

        # Skip thinking messages - these shouldn't be sent to the policy model
        if role == "thinking":
            continue

        input_msgs.append({"role": role, "content": content})

    return input_msgs


def build_context_string(task: dict) -> str:
    """
    Build a readable context string from messages for the judge.

    Each turn is rendered as "[ROLE]: content" (role upper-cased), turns are
    joined by blank lines, and 'thinking' messages are omitted.
    """
    messages = task.get("messages", [])
    system_prompt = task.get("system", None)

    context_parts = []

    if system_prompt:
        context_parts.append(f"[SYSTEM]: {system_prompt}")

    for msg in messages:
        role = msg.get("role", "unknown")
        content = msg.get("content", "")

        # Skip thinking messages
        if role == "thinking":
            continue

        context_parts.append(f"[{role.upper()}]: {content}")

    return "\n\n".join(context_parts)


def process_task(task: dict, fallback_id: str = "unknown") -> dict[str, Any]:
    """
    Process a single task dict into the preprocessed JSONL record format.

    Args:
        task: Raw task dict.
        fallback_id: Identifier used when metadata lacks a "taskId".

    Returns:
        Record dict ready to be serialized as one JSONL line.
    """
    metadata = task.get("metadata", {})
    task_id = metadata.get("taskId", fallback_id)

    record = {
        "uuid": str(task_id),
        "task_id": task_id,
        # Agent reference - tells NeMo-Gym which agent to route this to
        "agent_ref": {
            "type": "responses_api_agents",
            "name": "multichallenge_simple_agent",
        },
        # Input messages wrapped in responses_create_params (required by ng_collect_rollouts)
        "responses_create_params": {
            "input": build_input_messages(task),
        },
        # Rubric for evaluation
        "rubric": task.get("rubric", []),
        # Pre-built context string for the judge
        "context": build_context_string(task),
        # Full metadata, plus the raw conversation for traceability
        "metadata": {
            **metadata,
            "messages": task.get("messages", []),
            "system": task.get("system", None),
            "ground_truth_answer": task.get("ground_truth_answer", None),
        },
    }

    return record


def process_task_file(filepath: Path) -> dict[str, Any]:
    """Process a single task JSON file into JSONL record format.

    The file stem is used as the fallback task id when metadata has no taskId.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        task = json.load(f)
    return process_task(task, fallback_id=filepath.stem)


def process_jsonl_file(input_file: Path, output_file: Path) -> int:
    """Process a JSONL file where each line is a task.

    Invalid JSON lines and per-task processing errors are logged and skipped
    so one bad record does not abort the whole split.

    Returns:
        Number of records successfully written.
    """
    count = 0
    errors = 0

    print(f"Processing JSONL file: {input_file}")

    with open(input_file, "r", encoding="utf-8") as in_f, open(output_file, "w", encoding="utf-8") as out_f:
        for line_num, line in enumerate(in_f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                task = json.loads(line)
                record = process_task(task, fallback_id=f"line_{line_num}")
                out_f.write(json.dumps(record, ensure_ascii=False) + "\n")
                count += 1
            except json.JSONDecodeError as e:
                print(f"  Warning: Invalid JSON on line {line_num}: {e}")
                errors += 1
            except Exception as e:
                print(f"  Error processing line {line_num}: {e}")
                errors += 1

    print(f"  Wrote {count} records to {output_file}" + (f" ({errors} errors)" if errors else ""))
    return count


def process_split_jsonl(data_dir: Path, split: str, output_dir: Path) -> int:
    """Process a split from a JSONL file; returns the number of records written (0 if missing)."""
    input_file = data_dir / f"{split}.jsonl"
    if not input_file.exists():
        print(f"Warning: JSONL file not found: {input_file}")
        return 0

    output_file = output_dir / f"{split}.jsonl"
    return process_jsonl_file(input_file, output_file)


def process_split_json_dir(data_dir: Path, split: str, output_dir: Path) -> int:
    """Process all JSON files in a split directory; returns the number of records written (0 if missing)."""
    split_dir = data_dir / split
    if not split_dir.exists():
        print(f"Warning: Split directory not found: {split_dir}")
        return 0

    output_file = output_dir / f"{split}.jsonl"
    count = 0

    # Sorted for deterministic output ordering across runs.
    json_files = sorted(split_dir.glob("*.json"))
    print(f"Processing {len(json_files)} files from {split}...")

    with open(output_file, "w", encoding="utf-8") as out_f:
        for filepath in json_files:
            try:
                record = process_task_file(filepath)
                out_f.write(json.dumps(record, ensure_ascii=False) + "\n")
                count += 1
            except Exception as e:
                print(f"Error processing {filepath}: {e}")

    print(f"Wrote {count} records to {output_file}")
    return count


def main():
    """CLI entry point: parse args, then preprocess each requested split."""
    parser = argparse.ArgumentParser(description="Convert MultiChallenge data to NeMo-Gym JSONL format")
    parser.add_argument(
        "--data-dir",
        type=Path,
        default=DEFAULT_RAW_DATA_DIR,
        help=f"Directory containing the data (default: {DEFAULT_RAW_DATA_DIR})",
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=Path(__file__).parent / "data",
        help="Output directory for preprocessed JSONL files (default: ./data)",
    )
    parser.add_argument(
        "--splits",
        nargs="+",
        default=["advanced", "vanilla"],
        help="Splits to process (default: advanced vanilla)",
    )
    parser.add_argument(
        "--mode",
        choices=["jsonl", "json-dir"],
        default="jsonl",
        help="Input mode: 'jsonl' reads {split}.jsonl files, 'json-dir' reads {split}/*.json directories (default: jsonl)",
    )
    args = parser.parse_args()

    args.output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Input directory: {args.data_dir}")
    print(f"Output directory: {args.output_dir}")
    print(f"Mode: {args.mode}")
    print(f"Splits: {args.splits}")
    print()

    total = 0
    for split in args.splits:
        if args.mode == "jsonl":
            total += process_split_jsonl(args.data_dir, split, args.output_dir)
        else:
            total += process_split_json_dir(args.data_dir, split, args.output_dir)

    print(f"\nTotal: {total} records processed")


if __name__ == "__main__":
    main()
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Unit tests for the MultiChallenge resources server utilities and score aggregation."""

from unittest.mock import MagicMock

import pytest

from nemo_gym.config_types import ModelServerRef
from nemo_gym.openai_utils import NeMoGymResponseCreateParamsNonStreaming
from nemo_gym.server_utils import ServerClient
from resources_servers.multichallenge.app import (
    AggregationMode,
    MultiChallengeConfig,
    MultiChallengeServer,
    RubricEvaluation,
    _build_context_from_messages,
    _extract_verdict,
)


class TestMultiChallenge:
    """Tests for MultiChallenge environment utilities."""

    def test_extract_verdict_yes(self):
        """A [[YES]] label in the judge response yields a YES verdict."""
        response = "After analysis, the model correctly addressed the user's allergy. [[YES]]"
        verdict = _extract_verdict(response, "[[YES]]", "[[NO]]")
        assert verdict == "YES"

    def test_extract_verdict_no(self):
        """A [[NO]] label in the judge response yields a NO verdict."""
        response = "The model failed to remember the allergy. [[NO]]"
        verdict = _extract_verdict(response, "[[YES]]", "[[NO]]")
        assert verdict == "NO"

    def test_extract_verdict_fallback(self):
        """Without a bracketed label, a bare YES/NO in the text is used as fallback."""
        response = "The model did well.\nYES"
        verdict = _extract_verdict(response, "[[YES]]", "[[NO]]")
        assert verdict == "YES"

    def test_extract_verdict_last_wins(self):
        """When both labels appear, the last occurrence determines the verdict."""
        response = "Initially [[YES]] but actually [[NO]]"
        verdict = _extract_verdict(response, "[[YES]]", "[[NO]]")
        assert verdict == "NO"

    def test_build_context_excludes_thinking(self):
        """With exclude_thinking=True, 'thinking' turns are dropped from the context."""
        messages = [
            {"role": "user", "content": "Hello"},
            {"role": "thinking", "content": "Processing..."},
            {"role": "assistant", "content": "Hi there!"},
        ]
        context = _build_context_from_messages(messages, exclude_thinking=True)
        assert "Processing" not in context
        assert "[USER]: Hello" in context
        assert "[ASSISTANT]: Hi there!" in context

    def test_build_context_includes_thinking(self):
        """With exclude_thinking=False, 'thinking' turns appear as [THINKING] entries."""
        messages = [
            {"role": "user", "content": "Hello"},
            {"role": "thinking", "content": "Processing..."},
            {"role": "assistant", "content": "Hi there!"},
        ]
        context = _build_context_from_messages(messages, exclude_thinking=False)
        assert "[THINKING]: Processing" in context


class TestAggregation:
    """Tests for score aggregation."""

    def create_evaluations(self, scores: list[float]) -> list[RubricEvaluation]:
        """Create mock evaluations with given scores.

        A score of ~1.0 is treated as a passing (YES) verdict; anything lower is NO.
        """
        return [
            RubricEvaluation(
                question=f"Q{i}",
                pass_criteria="YES",
                judge_prompt="...",
                judge_response="...",
                verdict="YES" if s >= 0.99 else "NO",
                score=s,
                weight=1.0,
            )
            for i, s in enumerate(scores)
        ]

    def test_aggregation_modes(self):
        """Each aggregation mode combines per-rubric scores [1.0, 0.5, 0.0] as expected."""
        config = MultiChallengeConfig(
            host="",
            port=0,
            entrypoint="",
            name="test",
            judge_model_server=ModelServerRef(type="responses_api_models", name="test"),
            judge_responses_create_params=NeMoGymResponseCreateParamsNonStreaming(input=[]),
        )

        # Create a proper mock that passes pydantic validation; model_construct
        # skips validation of the server fields we don't exercise here.
        mock_client = MagicMock(spec=ServerClient)
        server = MultiChallengeServer.model_construct(config=config, server_client=mock_client)
        evaluations = self.create_evaluations([1.0, 0.5, 0.0])

        # MEAN: arithmetic average of the scores.
        config.aggregation_mode = AggregationMode.MEAN
        assert server._aggregate_scores(evaluations) == pytest.approx(0.5)

        # MIN: worst rubric score.
        config.aggregation_mode = AggregationMode.MIN
        assert server._aggregate_scores(evaluations) == 0.0

        # MAX: best rubric score.
        config.aggregation_mode = AggregationMode.MAX
        assert server._aggregate_scores(evaluations) == 1.0

        # ALL: 1.0 only if every rubric passes (only the first does here).
        config.aggregation_mode = AggregationMode.ALL
        assert server._aggregate_scores(evaluations) == 0.0

        # ANY: 1.0 if at least one rubric passes (the first does).
        config.aggregation_mode = AggregationMode.ANY
        assert server._aggregate_scores(evaluations) == 1.0