diff --git a/README.md b/README.md index eb29dc65d..162ca6b80 100644 --- a/README.md +++ b/README.md @@ -160,7 +160,6 @@ Purpose: Training-ready environments with curated datasets. | Mini Swe Agent | coding | SWE-Gym | A software development with mini-swe-agent orchestration | Improve software development capabilities, like SWE-bench | config | ✓ | ✓ | MIT | | Instruction Following | instruction_following | Nemotron-RL-instruction_following | Instruction following datasets targeting IFEval and IFBench style instruction following capabilities | Improve IFEval and IFBench | config | ✓ | - | Apache 2.0 | | Structured Outputs | instruction_following | Nemotron-RL-instruction_following-structured_outputs | Check if responses are following structured output requirements in prompts | Improve instruction following capabilities | config | ✓ | ✓ | Apache 2.0 | -| Equivalence Llm Judge | knowledge | Nemotron-RL-knowledge-openQA | Short answer questions with LLM-as-a-judge | Improve knowledge-related benchmarks like GPQA / HLE | config | ✓ | - | Apache 2.0 | | Mcqa | knowledge | Nemotron-RL-knowledge-mcqa | Multi-choice question answering problems | Improve benchmarks like MMLU / GPQA / HLE | config | ✓ | - | Apache 2.0 | | Math With Judge | math | Nemotron-RL-math-OpenMathReasoning | Math dataset with math-verify and LLM-as-a-judge | Improve math capabilities including AIME 24 / 25 | config | ✓ | ✓ | Creative Commons Attribution 4.0 International | | Math With Judge | math | Nemotron-RL-math-stack_overflow | - | - | config | ✓ | ✓ | Creative Commons Attribution-ShareAlike 4.0 International | diff --git a/resources_servers/calendar/app.py b/resources_servers/calendar/app.py index c844d7f4c..f0afeab64 100644 --- a/resources_servers/calendar/app.py +++ b/resources_servers/calendar/app.py @@ -47,7 +47,27 @@ def setup_webserver(self) -> FastAPI: return app async def verify(self, body: CalendarVerifyRequest) -> BaseVerifyResponse: - assistant_response = 
body.response.output[-1].content[0].text + # Extract the assistant's text response from the last output item. + # + # For reasoning models (e.g., with deepseek_r1 reasoning_parser), the output + # structure is: [ReasoningItem, MessageItem] where: + # - ReasoningItem: has .reasoning attribute (thinking/CoT tokens) + # - MessageItem: has .content attribute (actual response text) + # + # The last item should be a MessageItem with .content, but if the model + # hit the token limit while still thinking, the last item will be a + # ReasoningItem without .content. In that case, we return reward=0. + assistant_response = "" + if body.response.output: + last_output = body.response.output[-1] + if hasattr(last_output, "content") and last_output.content: + assistant_response = last_output.content[0].text + + # If no valid response (e.g., model only produced thinking tokens), + # return zero reward + if not assistant_response: + return BaseVerifyResponse(**body.model_dump(), reward=0) + exp_cal_state = body.exp_cal_state try: reward, reason = grade_assistant_response(assistant_response, exp_cal_state) diff --git a/resources_servers/equivalence_llm_judge/app.py b/resources_servers/equivalence_llm_judge/app.py index 3ff412310..a8da06f93 100644 --- a/resources_servers/equivalence_llm_judge/app.py +++ b/resources_servers/equivalence_llm_judge/app.py @@ -21,7 +21,9 @@ # limitations under the License. from __future__ import annotations +import asyncio import re +from contextlib import nullcontext from typing import Any, Optional from fastapi import FastAPI @@ -59,8 +61,11 @@ class LLMJudgeResourcesServerConfig(BaseResourcesServerConfig): judge_model_server: ModelServerRef judge_responses_create_params: NeMoGymResponseCreateParamsNonStreaming + # Concurrency limit for judge endpoint requests. Set to None to disable limiting. 
+ judge_endpoint_max_concurrency: Optional[int] = 64 + judge_system_message: Optional[str] = None - judge_prompt_template: str + judge_prompt_template_fpath: str = "prompt_templates/equivalence_llm_judge.txt" judge_equal_label: str = "[[A=B]]" judge_not_equal_label: str = "[[A!=B]]" # Optional regex to extract the question from the last user message. @@ -250,6 +255,17 @@ class LLMJudgeResourcesServer(SimpleResourcesServer): config: LLMJudgeResourcesServerConfig + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + if self.config.judge_endpoint_max_concurrency is not None: + self._judge_endpoint_max_concurrency = asyncio.Semaphore(value=self.config.judge_endpoint_max_concurrency) + else: + self._judge_endpoint_max_concurrency = nullcontext() + + with open(self.config.judge_prompt_template_fpath, "r") as f: + self._judge_prompt_template = f.read().strip() + def setup_webserver(self) -> FastAPI: app = super().setup_webserver() return app @@ -420,7 +436,7 @@ async def _generate_judge_evaluation( not_equal_label = cfg.judge_not_equal_label responses_create_params = cfg.judge_responses_create_params.model_copy(deep=True) - prompt_template = cfg.judge_prompt_template + prompt_template = self._judge_prompt_template system_message = cfg.judge_system_message user_prompt = prompt_template.format( @@ -433,12 +449,21 @@ async def _generate_judge_evaluation( msgs.append(NeMoGymEasyInputMessage(role="user", content=user_prompt)) responses_create_params.input = msgs - response = await self.server_client.post( - server_name=cfg.judge_model_server.name, - url_path="/v1/responses", - json=responses_create_params, - ) - judge_response = NeMoGymResponse.model_validate(await get_response_json(response)) + async with self._judge_endpoint_max_concurrency: + try: + response = await self.server_client.post( + server_name=cfg.judge_model_server.name, + url_path="/v1/responses", + json=responses_create_params, + ) + judge_response = 
NeMoGymResponse.model_validate(await get_response_json(response)) + except Exception as e: + print( + f"DEBUG: LLMJudgeResourcesServer: judge model server HTTP POST error: {type(e).__name__} {e}", + flush=True, + ) + raise + eval_record = JudgeEvaluation( responses_create_params=responses_create_params, response=judge_response, diff --git a/resources_servers/equivalence_llm_judge/configs/equivalence_llm_judge.yaml b/resources_servers/equivalence_llm_judge/configs/equivalence_llm_judge.yaml index 55bfb96d2..d2275fe85 100644 --- a/resources_servers/equivalence_llm_judge/configs/equivalence_llm_judge.yaml +++ b/resources_servers/equivalence_llm_judge/configs/equivalence_llm_judge.yaml @@ -7,64 +7,9 @@ equivalence_llm_judge: name: policy_model judge_responses_create_params: input: [] - judge_prompt_template: |- - ===== System role ===== - You are a meticulous STEM grader. Compare a candidate answer to a GOLD reference for a scientific question and decide strict equivalence. - - Grading priorities (in order): - 1) Factual equivalence to GOLD (accept algebraically/formally equivalent formulations). - 2) Completeness on required parts — the candidate must include the same core parts/subclaims as the GOLD. - - Rules: - - Treat GOLD as authoritative for what counts as correct. - - If GOLD is a range or set, the candidate is equivalent only if it lies within that range or is a member of that set. - - For formulas/derivations, accept mathematically identical transformations (e.g., symbol reordering, factoring, equivalent identities). - - Multi-part: all essential parts must match for “equivalent”; otherwise they are not equivalent. - - Be concise. Do NOT reveal or rewrite the GOLD. - - Show your reason why they are equivalent or not equivalent first and then provide the output.
- - Output (at the end after double newlines): - - If equivalent: [[A=B]] they are equivalent - - If not equivalent: [[A!=B]] they are not equivalent - - ===== Example 1 (equivalent) ===== - QUESTION: - State Avogadro’s constant (include units). - - GOLD: - 6.022 × 10^23 mol^-1 - - CANDIDATE: - 6.022e23 per mole. - - The candidate gives the same magnitude in scientific notation and the same “per mole” unit; no extra or missing qualifiers. - - [[A=B]] they are equivalent - - ===== Example 2 (not equivalent) ===== - QUESTION: - State the first law of thermodynamics for a closed system and identify what each symbol represents. - - GOLD: - ΔU = Q − W ; ΔU is change in internal energy, Q is heat added to the system, W is work done by the system. - - CANDIDATE: - ΔU = Q + W ; ΔU is change in internal energy, Q is heat added to the system, W is work done by the system. - - The candidate uses the opposite sign convention for work relative to the required relationship; one core part is incorrect, so the overall statement does not match. 
- - [[A!=B]] they are not equivalent - - ===== Inputs ===== - QUESTION: - {question} - - GOLD: - {expected_answer} - - CANDIDATE: - {generated_answer} + judge_prompt_template_fpath: prompt_templates/equivalence_llm_judge.txt + # Concurrency and rate limiting configuration + judge_endpoint_max_concurrency: 64 # Maximum number of concurrent judge requests (null disables limiting) judge_system_message: null judge_equal_label: "[[A=B]]" judge_not_equal_label: "[[A!=B]]" @@ -135,7 +80,7 @@ equivalence_llm_judge_simple_agent: license: "TBD" jsonl_fpath: resources_servers/equivalence_llm_judge/data/example_openqa.jsonl - name: train - type: train + type: example license: Apache 2.0 jsonl_fpath: resources_servers/equivalence_llm_judge/data/train.jsonl huggingface_identifier: diff --git a/resources_servers/equivalence_llm_judge/configs/lc.yaml b/resources_servers/equivalence_llm_judge/configs/lc.yaml new file mode 120000 index 000000000..b8218928a --- /dev/null +++ b/resources_servers/equivalence_llm_judge/configs/lc.yaml @@ -0,0 +1 @@ +lc_judge.yaml \ No newline at end of file diff --git a/resources_servers/equivalence_llm_judge/configs/lc_judge.yaml b/resources_servers/equivalence_llm_judge/configs/lc_judge.yaml new file mode 100644 index 000000000..db3843b50 --- /dev/null +++ b/resources_servers/equivalence_llm_judge/configs/lc_judge.yaml @@ -0,0 +1,35 @@ +lc_judge: + resources_servers: + equivalence_llm_judge: + entrypoint: app.py + judge_model_server: + type: responses_api_models + name: policy_model + judge_responses_create_params: + input: [] + judge_prompt_template_fpath: prompt_templates/lc_judge.txt + judge_endpoint_max_concurrency: 64 + judge_system_message: null + judge_equal_label: CORRECT + judge_not_equal_label: INCORRECT + check_twice_swap: false + reward_if_swap_fails: 0.0 + question_extract_regex: ^QUESTION:\s*(.*)$ + response_extract_regex: null + domain: knowledge + verified: false +lc_judge_simple_agent: + responses_api_agents: + simple_agent: + entrypoint: app.py + resources_server:
+ type: resources_servers + name: lc_judge + model_server: + type: responses_api_models + name: policy_model + datasets: + - name: example + type: example + license: TBD + jsonl_fpath: resources_servers/equivalence_llm_judge/data/example.jsonl diff --git a/resources_servers/equivalence_llm_judge/prompt_templates/equivalence_llm_judge.txt b/resources_servers/equivalence_llm_judge/prompt_templates/equivalence_llm_judge.txt new file mode 100644 index 000000000..7a7de001b --- /dev/null +++ b/resources_servers/equivalence_llm_judge/prompt_templates/equivalence_llm_judge.txt @@ -0,0 +1,58 @@ + +===== System role ===== +You are a meticulous STEM grader. Compare a candidate answer to a GOLD reference for a scientific question and decide strict equivalence. + +Grading priorities (in order): +1) Factual equivalence to GOLD (accept algebraically/formally equivalent formulations). +2) Completeness on required parts — the candidate must include the same core parts/subclaims as the GOLD. + +Rules: +- Treat GOLD as authoritative for what counts as correct. +- If GOLD is a range or set, the candidate is equivalent only if it lies within that range or is a member of that set. +- For formulas/derivations, accept mathematically identical transformations (e.g., symbol reordering, factoring, equivalent identities). +- Multi-part: all essential parts must match for “equivalent”; otherwise they are not equivalent. +- Be concise. Do NOT reveal or rewrite the GOLD. + +Show your reason why they are equivalent or not equivalent first and then provide the output. + +Output (at the end after double newlines): +- If equivalent: [[A=B]] they are equivalent +- If not equivalent: [[A!=B]] they are not equivalent + +===== Example 1 (equivalent) ===== +QUESTION: +State Avogadro’s constant (include units). + +GOLD: +6.022 × 10^23 mol^-1 + +CANDIDATE: +6.022e23 per mole. + +The candidate gives the same magnitude in scientific notation and the same “per mole” unit; no extra or missing qualifiers. 
+ +[[A=B]] they are equivalent + +===== Example 2 (not equivalent) ===== +QUESTION: +State the first law of thermodynamics for a closed system and identify what each symbol represents. + +GOLD: +ΔU = Q − W ; ΔU is change in internal energy, Q is heat added to the system, W is work done by the system. + +CANDIDATE: +ΔU = Q + W ; ΔU is change in internal energy, Q is heat added to the system, W is work done by the system. + +The candidate uses the opposite sign convention for work relative to the required relationship; one core part is incorrect, so the overall statement does not match. + +[[A!=B]] they are not equivalent + +===== Inputs ===== +QUESTION: +{question} + +GOLD: +{expected_answer} + +CANDIDATE: +{generated_answer} \ No newline at end of file diff --git a/resources_servers/equivalence_llm_judge/prompt_templates/lc.txt b/resources_servers/equivalence_llm_judge/prompt_templates/lc.txt new file mode 120000 index 000000000..b200b884c --- /dev/null +++ b/resources_servers/equivalence_llm_judge/prompt_templates/lc.txt @@ -0,0 +1 @@ +lc_judge.txt \ No newline at end of file diff --git a/resources_servers/equivalence_llm_judge/prompt_templates/lc_judge.txt b/resources_servers/equivalence_llm_judge/prompt_templates/lc_judge.txt new file mode 100644 index 000000000..bb44669d3 --- /dev/null +++ b/resources_servers/equivalence_llm_judge/prompt_templates/lc_judge.txt @@ -0,0 +1,9 @@ + +Assess whether the following CANDIDATE ANSWER is CORRECT or INCORRECT. +For the CANDIDATE ANSWER to be correct, it must be consistent with the OFFICIAL ANSWER. + +The question, for reference only: {question} +The OFFICIAL ANSWER: {expected_answer} +CANDIDATE ANSWER TO ASSESS: {generated_answer} + +Reply only with CORRECT or INCORRECT. 
\ No newline at end of file diff --git a/resources_servers/equivalence_llm_judge/tests/test_app.py b/resources_servers/equivalence_llm_judge/tests/test_app.py index 83d875ab7..d824d2c9b 100644 --- a/resources_servers/equivalence_llm_judge/tests/test_app.py +++ b/resources_servers/equivalence_llm_judge/tests/test_app.py @@ -16,7 +16,6 @@ from pathlib import Path from unittest.mock import AsyncMock, MagicMock -from omegaconf import OmegaConf from pytest import approx, fixture from nemo_gym.config_types import ModelServerRef @@ -38,11 +37,8 @@ class TestApp: @fixture def config(self) -> LLMJudgeResourcesServerConfig: - # Load judge template from YAML so tests mirror runtime config - yaml_path = Path(__file__).resolve().parents[1] / "configs" / "equivalence_llm_judge.yaml" - yaml_cfg = OmegaConf.load(str(yaml_path)) - judge_template: str = ( - yaml_cfg.equivalence_llm_judge.resources_servers.equivalence_llm_judge.judge_prompt_template + judge_prompt_template_fpath = str( + Path(__file__).resolve().parents[1] / "prompt_templates/equivalence_llm_judge.txt" ) cfg = LLMJudgeResourcesServerConfig( @@ -51,7 +47,7 @@ def config(self) -> LLMJudgeResourcesServerConfig: entrypoint="", judge_model_server=ModelServerRef(type="responses_api_models", name="judge"), judge_responses_create_params=NeMoGymResponseCreateParamsNonStreaming(input=[]), - judge_prompt_template=judge_template, + judge_prompt_template_fpath=judge_prompt_template_fpath, ) cfg.judge_equal_label = "[[A=B]]" cfg.judge_not_equal_label = "[[A!=B]]" diff --git a/resources_servers/multichallenge/.gitignore b/resources_servers/multichallenge/.gitignore new file mode 100644 index 000000000..ef5570e77 --- /dev/null +++ b/resources_servers/multichallenge/.gitignore @@ -0,0 +1,7 @@ +# Ignore data files (datasets should not be committed) +data/advanced/ +data/vanilla/ +*.json +*.jsonl +!configs/*.yaml +!data/example.jsonl diff --git a/resources_servers/multichallenge/README.md b/resources_servers/multichallenge/README.md new file mode 100644
index 000000000..6e877d942 --- /dev/null +++ b/resources_servers/multichallenge/README.md @@ -0,0 +1,313 @@ +# MultiChallenge Environment + +Evaluates model responses on the **MultiChallenge** benchmark using an LLM judge. This benchmark assesses multi-turn conversation quality through rubric-based evaluation. + +## Quick Start + +```bash +# 1. Run unit tests +ng_test +entrypoint=resources_servers/multichallenge + +# 2. Start servers (in terminal 1) +config_paths="resources_servers/multichallenge/configs/multichallenge.yaml,responses_api_models/vllm_model/configs/vllm_model.yaml" +ng_run "+config_paths=[${config_paths}]" + +# 3. Collect rollouts on example data (in terminal 2) +ng_collect_rollouts \ + +agent_name=multichallenge_simple_agent \ + +input_jsonl_fpath=resources_servers/multichallenge/data/example.jsonl \ + +output_jsonl_fpath=/tmp/multichallenge_rollouts.jsonl +``` + +## Overview + +Each MultiChallenge task contains: +- **Conversation context**: A multi-turn dialogue between user and assistant +- **Rubric**: A set of yes/no questions evaluating the final response quality +- **Metadata**: Task information including topic, challenge type, and persona + +The environment: +1. Feeds the conversation context to the policy model +2. Retrieves the final response (excluding thinking/reasoning blocks) +3. Evaluates each rubric question using an LLM judge +4. Aggregates scores using a configurable method (mean, min, all, etc.) + +## Data Preparation + +### Option A: Use Example Data Only (Quick Testing) + +The `data/example.jsonl` file contains 3 synthetic tasks ready to use: + +```bash +# No preprocessing needed - just run +ng_collect_rollouts \ + +agent_name=multichallenge_simple_agent \ + +input_jsonl_fpath=resources_servers/multichallenge/data/example.jsonl \ + +output_jsonl_fpath=/tmp/test_rollouts.jsonl +``` + +### Option B: Full Dataset Setup + +> **Important**: Run the preprocessing script **before launching training jobs**. 
+> The preprocessed JSONL files must exist in `data/` for the training pipeline to work. + +1. **Preprocess to JSONL format**: + ```bash + # Run from the multichallenge directory + cd resources_servers/multichallenge + python dataset_preprocess.py + ``` + + This reads from the raw data directory and outputs: + - `data/advanced.jsonl` (994 tasks) + - `data/vanilla.jsonl` (1023 tasks) + + The script supports two input modes: + - `--mode jsonl` (default): Reads pre-compiled `{split}.jsonl` files + - `--mode json-dir`: Reads individual `{split}/*.json` files from directories + + ```bash + # Custom input/output paths + python dataset_preprocess.py \ + --data-dir /path/to/raw/data \ + --output-dir ./data \ + --splits advanced vanilla + ``` + +2. **Run on full dataset**: + ```bash + ng_collect_rollouts \ + +agent_name=multichallenge_simple_agent \ + +input_jsonl_fpath=resources_servers/multichallenge/data/advanced.jsonl \ + +output_jsonl_fpath=/tmp/advanced_rollouts.jsonl + ``` + +## Testing + +### Unit Tests + +```bash +# Run all unit tests +ng_test +entrypoint=resources_servers/multichallenge + +# Or run directly with pytest for more detail +cd resources_servers/multichallenge +source .venv/bin/activate +pytest -v +``` + +Tests cover: +- Verdict extraction (`[[YES]]`/`[[NO]]`) +- Context building (excluding thinking messages) +- Score aggregation (mean, min, max, all, any, weighted) + +### End-to-End Sanity Test + +1. **Start servers**: + ```bash + config_paths="resources_servers/multichallenge/configs/multichallenge.yaml,responses_api_models/vllm_model/configs/vllm_model.yaml" + ng_run "+config_paths=[${config_paths}]" + ``` + +2. **In another terminal, run on example data**: + ```bash + ng_collect_rollouts \ + +agent_name=multichallenge_simple_agent \ + +input_jsonl_fpath=resources_servers/multichallenge/data/example.jsonl \ + +output_jsonl_fpath=/tmp/multichallenge_rollouts.jsonl \ + +limit=3 + ``` + +3. 
**View results**: + ```bash + cat /tmp/multichallenge_rollouts.jsonl | python -c " + import json, sys + for line in sys.stdin: + d = json.loads(line) + print(f\"Reward: {d.get('reward')} | Passed: {d.get('num_passed')}/{d.get('num_total')}\") + " + ``` + +## Configuration + +### Basic Setup + +```yaml +multichallenge: + resources_servers: + multichallenge: + entrypoint: app.py + + # Judge model configuration + judge_model_server: + type: responses_api_models + name: policy_model # or a dedicated judge model + + # Judge request parameters + judge_responses_create_params: + input: [] + max_output_tokens: 512 + temperature: 0.0 + + # Score aggregation: mean | min | max | all | any | weighted + aggregation_mode: mean +``` + +### Aggregation Modes + +| Mode | Description | +|------|-------------| +| `mean` | Average of all rubric scores | +| `min` | Minimum score (strictest) | +| `max` | Maximum score (most lenient) | +| `all` | All items must pass (binary: 0 or 1) | +| `any` | Any item passes (binary: 0 or 1) | +| `weighted` | Weighted average using rubric item weights | + +### Custom Judge Prompt + +You can customize the judge prompt template: + +```yaml +judge_prompt_template: |- + You are evaluating whether a model's response meets a specific criterion. + + CONVERSATION CONTEXT: + {context} + + MODEL'S FINAL RESPONSE: + {response} + + EVALUATION QUESTION: + {question} + + EXPECTED ANSWER: {pass_criteria} + + Respond with [[YES]] or [[NO]]. 
+``` + +Placeholders: +- `{context}`: Full conversation history +- `{response}`: The model's final response +- `{question}`: The rubric evaluation question +- `{pass_criteria}`: Expected answer (usually "YES") + +## Data Format + +### Raw JSON Format (Input) + +Each task file contains: + +```json +{ + "metadata": { + "taskId": 12345, + "topic": "Education & Research", + "challenge": "Inference Memory" + }, + "system": "Optional system prompt", + "messages": [ + {"role": "user", "content": "..."}, + {"role": "thinking", "content": "..."}, + {"role": "assistant", "content": "..."} + ], + "rubric": [ + { + "question": "Did the model correctly remember X?", + "pass_criteria": "YES" + } + ] +} +``` + +### Preprocessed JSONL Format (Output) + +Each line contains: + +```json +{ + "uuid": "12345", + "task_id": 12345, + "responses_create_params": { + "input": [{"role": "user", "content": "..."}] + }, + "rubric": [...], + "context": "[USER]: ...\n\n[ASSISTANT]: ...", + "metadata": {...} +} +``` + +Key transformations: +- `thinking` role messages are excluded from input +- `context` is pre-formatted for the LLM judge +- `responses_create_params` wraps input for `ng_collect_rollouts` + +## File Structure + +``` +multichallenge/ +├── app.py # Main server implementation +├── dataset_preprocess.py # JSON → JSONL converter +├── requirements.txt # Dependencies (-e nemo-gym[dev]) +├── README.md # This file +├── .gitignore # Excludes data from git +├── configs/ +│ └── multichallenge.yaml # Server + agent configuration +├── data/ +│ ├── example.jsonl # Example data (3 tasks, committed) +│ ├── advanced/ # Raw JSON files (ignored) +│ ├── vanilla/ # Raw JSON files (ignored) +│ ├── advanced.jsonl # Preprocessed (generated, ignored) +│ ├── vanilla.jsonl # Preprocessed (generated, ignored) +│ ├── .gitignore +│ └── README.md +└── tests/ + ├── __init__.py + └── test_multichallenge.py +``` + +## API Endpoints + +- `POST /verify` - Evaluate a model response against the rubric +- `POST 
/seed_session` - Initialize a new session + +### Verify Response + +```json +{ + "reward": 0.75, + "generated_response": "...", + "rubric_evaluations": [ + { + "question": "...", + "pass_criteria": "YES", + "verdict": "YES", + "score": 1.0 + } + ], + "num_passed": 3, + "num_total": 4, + "aggregation_mode": "mean" +} +``` + +## Example Rubric Evaluation + +Given a conversation about travel planning where the user mentioned a seafood allergy: + +```json +{ + "question": "Did the model correctly remember that the user is allergic to seafood and avoid recommending seafood dishes?", + "pass_criteria": "YES" +} +``` + +The LLM judge analyzes the model's response and returns `[[YES]]` or `[[NO]]`. + +--- + +**Note**: The default raw data path is hardcoded in `dataset_preprocess.py`: +``` +/lustre/fsw/portfolios/llmservice/users/mfathi/data/multichallenge +``` +Update `DEFAULT_RAW_DATA_DIR` in the script or use `--data-dir` to specify a different location. diff --git a/resources_servers/multichallenge/app.py b/resources_servers/multichallenge/app.py new file mode 100644 index 000000000..cb039a1a4 --- /dev/null +++ b/resources_servers/multichallenge/app.py @@ -0,0 +1,382 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +MultiChallenge Environment Resources Server. 
+ +Evaluates model responses on the MultiChallenge benchmark using an LLM judge. +Each task contains a conversation context and a rubric of yes/no questions +that assess the quality of the final assistant response. + +This environment: +1. Loads tasks from configurable splits (e.g., "advanced", "vanilla") +2. Feeds conversation context to the policy model +3. Retrieves the final response (excluding thinking parts) +4. Evaluates against each rubric entry using an LLM judge +5. Aggregates scores using a configurable method +""" + +from __future__ import annotations + +import re +from enum import Enum +from typing import Any, List, Optional + +from fastapi import FastAPI +from pydantic import BaseModel, ConfigDict, Field + +from nemo_gym.base_resources_server import ( + BaseResourcesServerConfig, + BaseRunRequest, + BaseVerifyRequest, + BaseVerifyResponse, + SimpleResourcesServer, +) +from nemo_gym.config_types import ModelServerRef +from nemo_gym.openai_utils import ( + NeMoGymEasyInputMessage, + NeMoGymResponse, + NeMoGymResponseCreateParamsNonStreaming, +) + + +class AggregationMode(str, Enum): + """How to aggregate rubric scores into a final reward.""" + + # Average of all rubric scores + MEAN = "mean" + # Minimum score across all rubric items (strict) + MIN = "min" + # Maximum score across all rubric items (lenient) + MAX = "max" + # All rubric items must pass (product of binary scores) + ALL = "all" + # Any rubric item passes (max of binary scores) + ANY = "any" + # Weighted average (requires weights in rubric items) + WEIGHTED = "weighted" + + +class RubricEvaluation(BaseModel): + """Result of evaluating a single rubric item.""" + + question: str + pass_criteria: str + judge_prompt: str + judge_response: str + verdict: str # "YES" or "NO" + score: float # 1.0 for pass, 0.0 for fail + weight: float = 1.0 + + +class MultiChallengeConfig(BaseResourcesServerConfig): + """Configuration for the MultiChallenge environment server.""" + + name: str = "multichallenge" + 
+ # Reference to the judge model server + judge_model_server: ModelServerRef = Field(description="Reference to the model server used as the LLM judge") + + # Parameters for judge requests + judge_responses_create_params: NeMoGymResponseCreateParamsNonStreaming = Field( + description="Base parameters for judge model requests" + ) + + # Aggregation mode for combining rubric scores + aggregation_mode: AggregationMode = Field( + default=AggregationMode.MEAN, description="How to aggregate scores from multiple rubric items" + ) + + # Template for the judge prompt + judge_prompt_template: str = Field( + default="""You are evaluating whether a model's response meets a specific criterion. + +CONVERSATION CONTEXT: +{context} + +MODEL'S FINAL RESPONSE: +{response} + +EVALUATION QUESTION: +{question} + +EXPECTED ANSWER: {pass_criteria} + +Does the model's response satisfy the criterion described in the evaluation question? +Analyze carefully, then respond with exactly [[YES]] or [[NO]] on the last line.""", + description="Template for the judge evaluation prompt", + ) + + # System message for the judge + judge_system_message: Optional[str] = Field( + default="You are a precise evaluator. 
Assess responses objectively based on the given criteria.", + description="Optional system message for the judge", + ) + + # Whether to run rubric evaluations in parallel + parallel_evaluation: bool = Field(default=True, description="Whether to evaluate rubric items in parallel") + + # Labels for verdict extraction + yes_label: str = Field(default="[[YES]]", description="Label indicating YES verdict") + no_label: str = Field(default="[[NO]]", description="Label indicating NO verdict") + + +class MultiChallengeRunRequest(BaseRunRequest): + """Run request payload for MultiChallenge tasks.""" + + model_config = ConfigDict(extra="allow") + + uuid: Optional[str | int] = None + task_id: Optional[int] = None + rubric: Optional[List[dict]] = None + context: Optional[str] = None + metadata: Optional[dict[str, Any]] = None + + +class MultiChallengeVerifyRequest(MultiChallengeRunRequest, BaseVerifyRequest): + """Verify request that includes the model's response.""" + + pass + + +class MultiChallengeVerifyResponse(BaseVerifyResponse): + """Response with detailed rubric evaluations.""" + + model_config = ConfigDict(extra="allow") + + context: str + generated_response: str + rubric_evaluations: List[RubricEvaluation] + aggregation_mode: str + num_passed: int + num_total: int + + +def _extract_text_from_response(response: NeMoGymResponse, exclude_thinking: bool = True) -> str: + """Extract text content from the last assistant message, optionally excluding thinking.""" + for output in reversed(response.output): + if getattr(output, "type", None) == "message" and getattr(output, "role", None) == "assistant": + content = getattr(output, "content", None) + if isinstance(content, list): + texts = [] + for c in content: + text = getattr(c, "text", None) + if isinstance(text, str): + texts.append(text) + full_text = "\n".join(texts).strip() + elif isinstance(content, str): + full_text = content.strip() + else: + continue + + if exclude_thinking: + # Remove ... 
blocks + full_text = re.sub(r".*?", "", full_text, flags=re.DOTALL) + # Also remove ... blocks + full_text = re.sub(r".*?", "", full_text, flags=re.DOTALL) + + return full_text.strip() + return "" + + +def _build_context_from_messages(messages: List[dict], exclude_thinking: bool = True) -> str: + """Build a readable context string from the message history.""" + context_parts = [] + for msg in messages: + role = msg.get("role", "unknown") + content = msg.get("content", "") + + # Skip thinking messages + if exclude_thinking and role == "thinking": + continue + + role_label = role.upper() + context_parts.append(f"[{role_label}]: {content}") + + return "\n\n".join(context_parts) + + +def _extract_verdict(response_text: str, yes_label: str, no_label: str) -> str: + """Extract YES/NO verdict from judge response.""" + # Look for the labels in the response + yes_pos = response_text.rfind(yes_label) + no_pos = response_text.rfind(no_label) + + if yes_pos < 0 and no_pos < 0: + # Fallback: look for plain YES/NO at end of response + lines = response_text.strip().split("\n") + last_line = lines[-1].strip().upper() if lines else "" + if "YES" in last_line: + return "YES" + elif "NO" in last_line: + return "NO" + return "NO" # Default to NO if unclear + + # Return whichever appears last (most authoritative) + if yes_pos > no_pos: + return "YES" + return "NO" + + +class MultiChallengeServer(SimpleResourcesServer): + """MultiChallenge evaluation server.""" + + config: MultiChallengeConfig + + def setup_webserver(self) -> FastAPI: + app = super().setup_webserver() + return app + + async def verify(self, body: MultiChallengeVerifyRequest) -> MultiChallengeVerifyResponse: + """Verify model response against the rubric using LLM judge.""" + + # Extract the generated response (without thinking) + generated_response = _extract_text_from_response(body.response, exclude_thinking=True) + + # Get context from the request or build from messages if available + context = body.context or "" + if 
not context and body.metadata and "messages" in body.metadata: + context = _build_context_from_messages(body.metadata["messages"]) + + # Get rubric from request + rubric = body.rubric or [] + if not rubric and body.metadata and "rubric" in body.metadata: + rubric = body.metadata["rubric"] + + # Evaluate each rubric item + if self.config.parallel_evaluation and len(rubric) > 1: + import asyncio + + evaluations = await asyncio.gather( + *[self._evaluate_rubric_item(item, context, generated_response) for item in rubric] + ) + else: + evaluations = [] + for item in rubric: + eval_result = await self._evaluate_rubric_item(item, context, generated_response) + evaluations.append(eval_result) + + # Aggregate scores + reward = self._aggregate_scores(evaluations) + num_passed = sum(1 for e in evaluations if e.score >= 0.99) + + # Build response + payload = body.model_dump() + payload.pop("context", None) + payload.pop("rubric", None) + + return MultiChallengeVerifyResponse( + **payload, + reward=reward, + context=context, + generated_response=generated_response, + rubric_evaluations=evaluations, + aggregation_mode=self.config.aggregation_mode.value, + num_passed=num_passed, + num_total=len(evaluations), + ) + + async def _evaluate_rubric_item(self, item: dict, context: str, response: str) -> RubricEvaluation: + """Evaluate a single rubric item using the LLM judge.""" + + question = item.get("question", "") + pass_criteria = item.get("pass_criteria", "YES") + weight = item.get("weight", 1.0) + + # Format the judge prompt + judge_prompt = self.config.judge_prompt_template.format( + context=context, + response=response, + question=question, + pass_criteria=pass_criteria, + ) + + # Build messages for judge + msgs: List[NeMoGymEasyInputMessage] = [] + if self.config.judge_system_message: + msgs.append(NeMoGymEasyInputMessage(role="system", content=self.config.judge_system_message)) + msgs.append(NeMoGymEasyInputMessage(role="user", content=judge_prompt)) + + # Create request 
parameters + request_params = self.config.judge_responses_create_params.model_copy(deep=True) + request_params.input = msgs + + # Call judge model + response_obj = await self.server_client.post( + server_name=self.config.judge_model_server.name, + url_path="/v1/responses", + json=request_params, + ) + judge_response = NeMoGymResponse.model_validate(await response_obj.json()) + judge_text = _extract_text_from_response(judge_response, exclude_thinking=True) + + # Extract verdict + verdict = _extract_verdict(judge_text, self.config.yes_label, self.config.no_label) + + # Score based on whether verdict matches expected criteria + if pass_criteria.upper() == "YES": + score = 1.0 if verdict == "YES" else 0.0 + elif pass_criteria.upper() == "NO": + score = 1.0 if verdict == "NO" else 0.0 + else: + # For other criteria, treat YES as success + score = 1.0 if verdict == "YES" else 0.0 + + return RubricEvaluation( + question=question, + pass_criteria=pass_criteria, + judge_prompt=judge_prompt, + judge_response=judge_text, + verdict=verdict, + score=score, + weight=weight, + ) + + def _aggregate_scores(self, evaluations: List[RubricEvaluation]) -> float: + """Aggregate rubric scores into final reward.""" + if not evaluations: + return 0.0 + + scores = [e.score for e in evaluations] + weights = [e.weight for e in evaluations] + + mode = self.config.aggregation_mode + + if mode == AggregationMode.MEAN: + return sum(scores) / len(scores) + + elif mode == AggregationMode.MIN: + return min(scores) + + elif mode == AggregationMode.MAX: + return max(scores) + + elif mode == AggregationMode.ALL: + return 1.0 if all(s >= 0.99 for s in scores) else 0.0 + + elif mode == AggregationMode.ANY: + return 1.0 if any(s >= 0.99 for s in scores) else 0.0 + + elif mode == AggregationMode.WEIGHTED: + total_weight = sum(weights) + if total_weight == 0: + return 0.0 + weighted_sum = sum(s * w for s, w in zip(scores, weights)) + return weighted_sum / total_weight + + return 0.0 + + +if __name__ == 
"__main__": + MultiChallengeServer.run_webserver() diff --git a/resources_servers/multichallenge/configs/multichallenge.yaml b/resources_servers/multichallenge/configs/multichallenge.yaml new file mode 100644 index 000000000..7cb7ec74c --- /dev/null +++ b/resources_servers/multichallenge/configs/multichallenge.yaml @@ -0,0 +1,72 @@ +multichallenge: + resources_servers: + multichallenge: + entrypoint: app.py + judge_model_server: + type: responses_api_models + name: policy_model + judge_responses_create_params: + input: [] + max_output_tokens: 8192 + temperature: 0.7 + top_p: 0.8 + aggregation_mode: mean + parallel_evaluation: true + judge_system_message: You are a precise evaluator. Assess responses objectively + based on the given criteria. Analyze the response carefully against the evaluation + question. + judge_prompt_template: 'You are evaluating whether a model''s response meets + a specific criterion. + + + CONVERSATION CONTEXT: + + {context} + + + MODEL''S FINAL RESPONSE: + + {response} + + + EVALUATION QUESTION: + + {question} + + + EXPECTED ANSWER: {pass_criteria} + + + Does the model''s response satisfy the criterion described in the evaluation + question? + + Think step by step, then respond with exactly [[YES]] or [[NO]] on the last + line.' 
+ yes_label: '[[YES]]' + no_label: '[[NO]]' + domain: knowledge + description: MultiChallenge benchmark evaluation with LLM judge + verified: false +multichallenge_simple_agent: + responses_api_agents: + simple_agent: + entrypoint: app.py + resources_server: + type: resources_servers + name: multichallenge + model_server: + type: responses_api_models + name: policy_model + datasets: + - name: multichallenge_example + type: example + license: Apache 2.0 + jsonl_fpath: resources_servers/multichallenge/data/example.jsonl + - name: multichallenge_advanced + type: train + license: TBD + jsonl_fpath: resources_servers/multichallenge/data/advanced.jsonl + - name: multichallenge_vanilla + type: train + license: TBD + jsonl_fpath: resources_servers/multichallenge/data/vanilla.jsonl diff --git a/resources_servers/multichallenge/configs/multichallenge_nrl.yaml b/resources_servers/multichallenge/configs/multichallenge_nrl.yaml new file mode 100644 index 000000000..7b67aa0f0 --- /dev/null +++ b/resources_servers/multichallenge/configs/multichallenge_nrl.yaml @@ -0,0 +1,72 @@ +multichallenge: + resources_servers: + multichallenge: + entrypoint: app.py + judge_model_server: + type: responses_api_models + name: policy_model + judge_responses_create_params: + input: [] + max_output_tokens: 512 + temperature: 1.0 + top_p: 1.0 + aggregation_mode: mean + parallel_evaluation: true + judge_system_message: You are a precise evaluator. Assess responses objectively + based on the given criteria. Analyze the response carefully against the evaluation + question. + judge_prompt_template: 'You are evaluating whether a model''s response meets + a specific criterion. + + + CONVERSATION CONTEXT: + + {context} + + + MODEL''S FINAL RESPONSE: + + {response} + + + EVALUATION QUESTION: + + {question} + + + EXPECTED ANSWER: {pass_criteria} + + + Does the model''s response satisfy the criterion described in the evaluation + question? 
+ + Think step by step, then respond with exactly [[YES]] or [[NO]] on the last + line.' + yes_label: '[[YES]]' + no_label: '[[NO]]' + domain: knowledge + description: MultiChallenge benchmark evaluation with LLM judge + verified: false +multichallenge_simple_agent: + responses_api_agents: + simple_agent: + entrypoint: app.py + resources_server: + type: resources_servers + name: multichallenge + model_server: + type: responses_api_models + name: policy_model + datasets: + - name: multichallenge_example + type: example + license: Apache 2.0 + jsonl_fpath: resources_servers/multichallenge/data/example.jsonl + - name: multichallenge_advanced + type: train + license: TBD + jsonl_fpath: resources_servers/multichallenge/data/advanced.jsonl + - name: multichallenge_vanilla + type: train + license: TBD + jsonl_fpath: resources_servers/multichallenge/data/vanilla.jsonl diff --git a/resources_servers/multichallenge/data/.gitignore b/resources_servers/multichallenge/data/.gitignore new file mode 100644 index 000000000..44c53a589 --- /dev/null +++ b/resources_servers/multichallenge/data/.gitignore @@ -0,0 +1,17 @@ +# Ignore all data files in this directory +# The MultiChallenge dataset should be copied here but not committed to git + +# Ignore raw data directories +advanced/ +vanilla/ + +# Ignore preprocessed JSONL files (generated from JSON files) +advanced.jsonl +vanilla.jsonl + +# Keep these files +!.gitignore +!README.md +!example.jsonl +!example_rollouts.jsonl +!example_metrics.json diff --git a/resources_servers/multichallenge/data/README.md b/resources_servers/multichallenge/data/README.md new file mode 100644 index 000000000..d8db78eb6 --- /dev/null +++ b/resources_servers/multichallenge/data/README.md @@ -0,0 +1,131 @@ +# MultiChallenge Data Directory + +This directory contains the MultiChallenge benchmark dataset. 
+ +## Quick Start + +```bash +# Option A: Use example data only (no setup needed) +# The example.jsonl file is ready to use for testing + +# Option B: Full dataset setup +# 1. Copy raw data +cp -r /path/to/multichallenge/advanced ./advanced +cp -r /path/to/multichallenge/vanilla ./vanilla + +# 2. Preprocess to JSONL (run from parent directory) +cd .. +python dataset_preprocess.py +``` + +## Directory Structure + +``` +data/ +├── example.jsonl # Example dataset (3 tasks, committed to git) +├── advanced/ # Raw JSON task files (113 tasks, ignored) +│ └── *.json +├── vanilla/ # Raw JSON task files (111 tasks, ignored) +│ └── *.json +├── advanced.jsonl # Preprocessed dataset (generated, ignored) +├── vanilla.jsonl # Preprocessed dataset (generated, ignored) +├── .gitignore # Excludes data files from git +└── README.md # This file +``` + +## Example Dataset + +The `example.jsonl` file contains 3 synthetic tasks for quick testing: + +| # | Challenge | Rubric Items | Tests | +|---|-----------|--------------|-------| +| 1 | Memory Retention | 2 | Peanut allergy recall, name usage | +| 2 | Preference Update | 3 | Pescatarian diet, preference correction | +| 3 | Context Tracking | 2 | Presentation intro, climate change topic | + +**Usage:** +```bash +ng_collect_rollouts \ + +agent_name=multichallenge_simple_agent \ + +input_jsonl_fpath=resources_servers/multichallenge/data/example.jsonl \ + +output_jsonl_fpath=/tmp/test_rollouts.jsonl +``` + +## Raw JSON Format + +Each task JSON file contains: + +```json +{ + "metadata": { + "taskId": 12345, + "topic": "Travel & Transportation", + "challenge": "Inference Memory", + "persona": "..." 
+ }, + "system": "Optional system prompt", + "messages": [ + {"role": "user", "content": "..."}, + {"role": "thinking", "content": "..."}, + {"role": "assistant", "content": "..."}, + {"role": "user", "content": "..."} + ], + "rubric": [ + { + "question": "Did the model correctly remember X?", + "pass_criteria": "YES" + } + ], + "ground_truth_answer": "...", + "model_responses": [...] // Ignored by this environment +} +``` + +## Preprocessed JSONL Format + +Each line in the JSONL file: + +```json +{ + "uuid": "12345", + "task_id": 12345, + "responses_create_params": { + "input": [ + {"role": "system", "content": "..."}, + {"role": "user", "content": "..."}, + {"role": "assistant", "content": "..."}, + {"role": "user", "content": "..."} + ] + }, + "rubric": [...], + "context": "[USER]: ...\n\n[ASSISTANT]: ...", + "metadata": {...} +} +``` + +**Key transformations:** +- `thinking` role messages are **excluded** from `responses_create_params.input` +- `context` is a pre-formatted string for the LLM judge (also excludes thinking) +- `responses_create_params` wrapper is required by `ng_collect_rollouts` +- `metadata` preserves full original data for reference + +## Regenerating JSONL Files + +If you modify the raw data or preprocessing logic: + +```bash +python dataset_preprocess.py --data-dir ./data --splits advanced vanilla +``` + +**Options:** +- `--data-dir`: Directory containing split subdirectories (default: `./data`) +- `--output-dir`: Where to write JSONL files (default: same as data-dir) +- `--splits`: Which splits to process (default: `advanced vanilla`) + +## Git Ignored Files + +The following are excluded from version control: +- `advanced/` and `vanilla/` directories (raw data) +- `advanced.jsonl` and `vanilla.jsonl` (preprocessed data) + +The `example.jsonl` file **is committed** for testing purposes. 
diff --git a/resources_servers/multichallenge/data/example.jsonl b/resources_servers/multichallenge/data/example.jsonl new file mode 100644 index 000000000..7593a7dcc --- /dev/null +++ b/resources_servers/multichallenge/data/example.jsonl @@ -0,0 +1,5 @@ +{"uuid": "example_1", "task_id": 1, "agent_ref": {"type": "responses_api_agents", "name": "multichallenge_simple_agent"}, "responses_create_params": {"input": [{"role": "user", "content": "Hi! My name is Alex and I'm allergic to peanuts. Can you remember that?"}, {"role": "assistant", "content": "Of course, Alex! I've noted that you have a peanut allergy. I'll make sure to keep that in mind for any food-related suggestions. Is there anything else you'd like me to know about your dietary restrictions?"}, {"role": "user", "content": "No, that's all. Can you suggest a snack for me?"}]}, "rubric": [{"question": "Did the model correctly remember and avoid suggesting peanuts or peanut-containing snacks?", "pass_criteria": "YES"}, {"question": "Did the model address the user by their name (Alex)?", "pass_criteria": "YES"}], "context": "[USER]: Hi! My name is Alex and I'm allergic to peanuts. Can you remember that?\n\n[ASSISTANT]: Of course, Alex! I've noted that you have a peanut allergy. I'll make sure to keep that in mind for any food-related suggestions. Is there anything else you'd like me to know about your dietary restrictions?\n\n[USER]: No, that's all. Can you suggest a snack for me?", "metadata": {"topic": "Food & Dietary", "challenge": "Memory Retention"}} +{"uuid": "example_2", "task_id": 2, "agent_ref": {"type": "responses_api_agents", "name": "multichallenge_simple_agent"}, "responses_create_params": {"input": [{"role": "system", "content": "You are a helpful travel assistant. Always be concise and practical."}, {"role": "user", "content": "I'm planning a trip to Japan next month. I don't eat meat."}, {"role": "assistant", "content": "Great choice! Japan has wonderful vegetarian options. 
I'll note that you're vegetarian for restaurant recommendations."}, {"role": "user", "content": "Actually, I eat fish, just not meat."}, {"role": "assistant", "content": "Understood! So you're pescatarian - you eat fish but not meat. That's perfect for Japan since seafood is a major part of the cuisine. I've updated my notes."}, {"role": "user", "content": "What are some must-try dishes for me in Tokyo?"}]}, "rubric": [{"question": "Did the model recommend dishes that include fish/seafood but exclude meat?", "pass_criteria": "YES"}, {"question": "Did the model correctly remember the user is pescatarian (not vegetarian)?", "pass_criteria": "YES"}, {"question": "Were the recommendations specific to Tokyo?", "pass_criteria": "YES"}], "context": "[SYSTEM]: You are a helpful travel assistant. Always be concise and practical.\n\n[USER]: I'm planning a trip to Japan next month. I don't eat meat.\n\n[ASSISTANT]: Great choice! Japan has wonderful vegetarian options. I'll note that you're vegetarian for restaurant recommendations.\n\n[USER]: Actually, I eat fish, just not meat.\n\n[ASSISTANT]: Understood! So you're pescatarian - you eat fish but not meat. That's perfect for Japan since seafood is a major part of the cuisine. I've updated my notes.\n\n[USER]: What are some must-try dishes for me in Tokyo?", "metadata": {"topic": "Travel", "challenge": "Preference Update"}} +{"uuid": "example_3", "task_id": 3, "agent_ref": {"type": "responses_api_agents", "name": "multichallenge_simple_agent"}, "responses_create_params": {"input": [{"role": "user", "content": "I need help with my project. The deadline is Friday and today is Monday."}, {"role": "assistant", "content": "You have 4 days until your Friday deadline. Let's break this down - what's the project about and what have you completed so far?"}, {"role": "user", "content": "It's a presentation about climate change. 
I've done the research but haven't started the slides yet."}, {"role": "assistant", "content": "Good progress on research! For a 4-day timeline, I suggest: Day 1-2 (Mon-Tue): Create outline and draft slides. Day 3 (Wed): Add visuals and refine content. Day 4 (Thu): Practice and final polish. Would you like help with any specific section?"}, {"role": "user", "content": "Yes, can you help me structure the introduction?"}]}, "rubric": [{"question": "Did the model provide advice relevant to creating a presentation introduction?", "pass_criteria": "YES"}, {"question": "Did the model's response acknowledge the climate change topic?", "pass_criteria": "YES"}], "context": "[USER]: I need help with my project. The deadline is Friday and today is Monday.\n\n[ASSISTANT]: You have 4 days until your Friday deadline. Let's break this down - what's the project about and what have you completed so far?\n\n[USER]: It's a presentation about climate change. I've done the research but haven't started the slides yet.\n\n[ASSISTANT]: Good progress on research! For a 4-day timeline, I suggest: Day 1-2 (Mon-Tue): Create outline and draft slides. Day 3 (Wed): Add visuals and refine content. Day 4 (Thu): Practice and final polish. Would you like help with any specific section?\n\n[USER]: Yes, can you help me structure the introduction?", "metadata": {"topic": "Productivity", "challenge": "Context Tracking"}} +{"uuid": "example_3", "task_id": 3, "agent_ref": {"type": "responses_api_agents", "name": "multichallenge_simple_agent"}, "responses_create_params": {"input": [{"role": "user", "content": "I need help with my project. The deadline is Friday and today is Monday."}, {"role": "assistant", "content": "You have 4 days until your Friday deadline. Let's break this down - what's the project about and what have you completed so far?"}, {"role": "user", "content": "It's a presentation about climate change. 
I've done the research but haven't started the slides yet."}, {"role": "assistant", "content": "Good progress on research! For a 4-day timeline, I suggest: Day 1-2 (Mon-Tue): Create outline and draft slides. Day 3 (Wed): Add visuals and refine content. Day 4 (Thu): Practice and final polish. Would you like help with any specific section?"}, {"role": "user", "content": "Yes, can you help me structure the introduction?"}]}, "rubric": [{"question": "Did the model provide advice relevant to creating a presentation introduction?", "pass_criteria": "YES"}, {"question": "Did the model's response acknowledge the climate change topic?", "pass_criteria": "YES"}], "context": "[USER]: I need help with my project. The deadline is Friday and today is Monday.\n\n[ASSISTANT]: You have 4 days until your Friday deadline. Let's break this down - what's the project about and what have you completed so far?\n\n[USER]: It's a presentation about climate change. I've done the research but haven't started the slides yet.\n\n[ASSISTANT]: Good progress on research! For a 4-day timeline, I suggest: Day 1-2 (Mon-Tue): Create outline and draft slides. Day 3 (Wed): Add visuals and refine content. Day 4 (Thu): Practice and final polish. Would you like help with any specific section?\n\n[USER]: Yes, can you help me structure the introduction?", "metadata": {"topic": "Productivity", "challenge": "Context Tracking"}} +{"uuid": "example_3", "task_id": 3, "agent_ref": {"type": "responses_api_agents", "name": "multichallenge_simple_agent"}, "responses_create_params": {"input": [{"role": "user", "content": "I need help with my project. The deadline is Friday and today is Monday."}, {"role": "assistant", "content": "You have 4 days until your Friday deadline. Let's break this down - what's the project about and what have you completed so far?"}, {"role": "user", "content": "It's a presentation about climate change. 
I've done the research but haven't started the slides yet."}, {"role": "assistant", "content": "Good progress on research! For a 4-day timeline, I suggest: Day 1-2 (Mon-Tue): Create outline and draft slides. Day 3 (Wed): Add visuals and refine content. Day 4 (Thu): Practice and final polish. Would you like help with any specific section?"}, {"role": "user", "content": "Yes, can you help me structure the introduction?"}]}, "rubric": [{"question": "Did the model provide advice relevant to creating a presentation introduction?", "pass_criteria": "YES"}, {"question": "Did the model's response acknowledge the climate change topic?", "pass_criteria": "YES"}], "context": "[USER]: I need help with my project. The deadline is Friday and today is Monday.\n\n[ASSISTANT]: You have 4 days until your Friday deadline. Let's break this down - what's the project about and what have you completed so far?\n\n[USER]: It's a presentation about climate change. I've done the research but haven't started the slides yet.\n\n[ASSISTANT]: Good progress on research! For a 4-day timeline, I suggest: Day 1-2 (Mon-Tue): Create outline and draft slides. Day 3 (Wed): Add visuals and refine content. Day 4 (Thu): Practice and final polish. 
Would you like help with any specific section?\n\n[USER]: Yes, can you help me structure the introduction?", "metadata": {"topic": "Productivity", "challenge": "Context Tracking"}} diff --git a/resources_servers/multichallenge/data/example_metrics.json b/resources_servers/multichallenge/data/example_metrics.json new file mode 100644 index 000000000..8f60d2d9d --- /dev/null +++ b/resources_servers/multichallenge/data/example_metrics.json @@ -0,0 +1,53 @@ +{ + "name": "multichallenge_example", + "type": "example", + "jsonl_fpath": "resources_servers/multichallenge/data/example.jsonl", + "num_repeats": 1, + "gitlab_identifier": null, + "huggingface_identifier": null, + "license": "Apache 2.0", + "Number of examples": 5, + "Number of tools": { + "Total # non-null values": 0, + "Average": 0.0, + "Min": 0.0, + "Max": 0.0, + "Standard deviation": 0.0 + }, + "Json-dumped number of words (proxy for token count)": { + "Total # non-null values": 5, + "Average": 105.6, + "Min": 70.0, + "Max": 119.0, + "Standard deviation": 21.37 + }, + "Number of turns": { + "Total # non-null values": 5, + "Average": 2.8, + "Min": 2.0, + "Max": 3.0, + "Standard deviation": 0.447 + }, + "Temperature": { + "Total # non-null values": 0, + "Average": 0.0, + "Min": 0.0, + "Max": 0.0, + "Standard deviation": 0.0 + }, + "uuid": { + "unique_count": 3, + "total_count": 5 + }, + "task_id": { + "Total # non-null values": 5, + "Average": 2.4, + "Min": 1.0, + "Max": 3.0, + "Standard deviation": 0.894 + }, + "context": { + "unique_count": 3, + "total_count": 5 + } +} \ No newline at end of file diff --git a/resources_servers/multichallenge/data/example_rollouts.jsonl b/resources_servers/multichallenge/data/example_rollouts.jsonl new file mode 100644 index 000000000..7593a7dcc --- /dev/null +++ b/resources_servers/multichallenge/data/example_rollouts.jsonl @@ -0,0 +1,5 @@ +{"uuid": "example_1", "task_id": 1, "agent_ref": {"type": "responses_api_agents", "name": "multichallenge_simple_agent"}, 
"responses_create_params": {"input": [{"role": "user", "content": "Hi! My name is Alex and I'm allergic to peanuts. Can you remember that?"}, {"role": "assistant", "content": "Of course, Alex! I've noted that you have a peanut allergy. I'll make sure to keep that in mind for any food-related suggestions. Is there anything else you'd like me to know about your dietary restrictions?"}, {"role": "user", "content": "No, that's all. Can you suggest a snack for me?"}]}, "rubric": [{"question": "Did the model correctly remember and avoid suggesting peanuts or peanut-containing snacks?", "pass_criteria": "YES"}, {"question": "Did the model address the user by their name (Alex)?", "pass_criteria": "YES"}], "context": "[USER]: Hi! My name is Alex and I'm allergic to peanuts. Can you remember that?\n\n[ASSISTANT]: Of course, Alex! I've noted that you have a peanut allergy. I'll make sure to keep that in mind for any food-related suggestions. Is there anything else you'd like me to know about your dietary restrictions?\n\n[USER]: No, that's all. Can you suggest a snack for me?", "metadata": {"topic": "Food & Dietary", "challenge": "Memory Retention"}} +{"uuid": "example_2", "task_id": 2, "agent_ref": {"type": "responses_api_agents", "name": "multichallenge_simple_agent"}, "responses_create_params": {"input": [{"role": "system", "content": "You are a helpful travel assistant. Always be concise and practical."}, {"role": "user", "content": "I'm planning a trip to Japan next month. I don't eat meat."}, {"role": "assistant", "content": "Great choice! Japan has wonderful vegetarian options. I'll note that you're vegetarian for restaurant recommendations."}, {"role": "user", "content": "Actually, I eat fish, just not meat."}, {"role": "assistant", "content": "Understood! So you're pescatarian - you eat fish but not meat. That's perfect for Japan since seafood is a major part of the cuisine. 
I've updated my notes."}, {"role": "user", "content": "What are some must-try dishes for me in Tokyo?"}]}, "rubric": [{"question": "Did the model recommend dishes that include fish/seafood but exclude meat?", "pass_criteria": "YES"}, {"question": "Did the model correctly remember the user is pescatarian (not vegetarian)?", "pass_criteria": "YES"}, {"question": "Were the recommendations specific to Tokyo?", "pass_criteria": "YES"}], "context": "[SYSTEM]: You are a helpful travel assistant. Always be concise and practical.\n\n[USER]: I'm planning a trip to Japan next month. I don't eat meat.\n\n[ASSISTANT]: Great choice! Japan has wonderful vegetarian options. I'll note that you're vegetarian for restaurant recommendations.\n\n[USER]: Actually, I eat fish, just not meat.\n\n[ASSISTANT]: Understood! So you're pescatarian - you eat fish but not meat. That's perfect for Japan since seafood is a major part of the cuisine. I've updated my notes.\n\n[USER]: What are some must-try dishes for me in Tokyo?", "metadata": {"topic": "Travel", "challenge": "Preference Update"}} +{"uuid": "example_3", "task_id": 3, "agent_ref": {"type": "responses_api_agents", "name": "multichallenge_simple_agent"}, "responses_create_params": {"input": [{"role": "user", "content": "I need help with my project. The deadline is Friday and today is Monday."}, {"role": "assistant", "content": "You have 4 days until your Friday deadline. Let's break this down - what's the project about and what have you completed so far?"}, {"role": "user", "content": "It's a presentation about climate change. I've done the research but haven't started the slides yet."}, {"role": "assistant", "content": "Good progress on research! For a 4-day timeline, I suggest: Day 1-2 (Mon-Tue): Create outline and draft slides. Day 3 (Wed): Add visuals and refine content. Day 4 (Thu): Practice and final polish. 
Would you like help with any specific section?"}, {"role": "user", "content": "Yes, can you help me structure the introduction?"}]}, "rubric": [{"question": "Did the model provide advice relevant to creating a presentation introduction?", "pass_criteria": "YES"}, {"question": "Did the model's response acknowledge the climate change topic?", "pass_criteria": "YES"}], "context": "[USER]: I need help with my project. The deadline is Friday and today is Monday.\n\n[ASSISTANT]: You have 4 days until your Friday deadline. Let's break this down - what's the project about and what have you completed so far?\n\n[USER]: It's a presentation about climate change. I've done the research but haven't started the slides yet.\n\n[ASSISTANT]: Good progress on research! For a 4-day timeline, I suggest: Day 1-2 (Mon-Tue): Create outline and draft slides. Day 3 (Wed): Add visuals and refine content. Day 4 (Thu): Practice and final polish. Would you like help with any specific section?\n\n[USER]: Yes, can you help me structure the introduction?", "metadata": {"topic": "Productivity", "challenge": "Context Tracking"}} +{"uuid": "example_3", "task_id": 3, "agent_ref": {"type": "responses_api_agents", "name": "multichallenge_simple_agent"}, "responses_create_params": {"input": [{"role": "user", "content": "I need help with my project. The deadline is Friday and today is Monday."}, {"role": "assistant", "content": "You have 4 days until your Friday deadline. Let's break this down - what's the project about and what have you completed so far?"}, {"role": "user", "content": "It's a presentation about climate change. I've done the research but haven't started the slides yet."}, {"role": "assistant", "content": "Good progress on research! For a 4-day timeline, I suggest: Day 1-2 (Mon-Tue): Create outline and draft slides. Day 3 (Wed): Add visuals and refine content. Day 4 (Thu): Practice and final polish. 
Would you like help with any specific section?"}, {"role": "user", "content": "Yes, can you help me structure the introduction?"}]}, "rubric": [{"question": "Did the model provide advice relevant to creating a presentation introduction?", "pass_criteria": "YES"}, {"question": "Did the model's response acknowledge the climate change topic?", "pass_criteria": "YES"}], "context": "[USER]: I need help with my project. The deadline is Friday and today is Monday.\n\n[ASSISTANT]: You have 4 days until your Friday deadline. Let's break this down - what's the project about and what have you completed so far?\n\n[USER]: It's a presentation about climate change. I've done the research but haven't started the slides yet.\n\n[ASSISTANT]: Good progress on research! For a 4-day timeline, I suggest: Day 1-2 (Mon-Tue): Create outline and draft slides. Day 3 (Wed): Add visuals and refine content. Day 4 (Thu): Practice and final polish. Would you like help with any specific section?\n\n[USER]: Yes, can you help me structure the introduction?", "metadata": {"topic": "Productivity", "challenge": "Context Tracking"}} +{"uuid": "example_3", "task_id": 3, "agent_ref": {"type": "responses_api_agents", "name": "multichallenge_simple_agent"}, "responses_create_params": {"input": [{"role": "user", "content": "I need help with my project. The deadline is Friday and today is Monday."}, {"role": "assistant", "content": "You have 4 days until your Friday deadline. Let's break this down - what's the project about and what have you completed so far?"}, {"role": "user", "content": "It's a presentation about climate change. I've done the research but haven't started the slides yet."}, {"role": "assistant", "content": "Good progress on research! For a 4-day timeline, I suggest: Day 1-2 (Mon-Tue): Create outline and draft slides. Day 3 (Wed): Add visuals and refine content. Day 4 (Thu): Practice and final polish. 
Would you like help with any specific section?"}, {"role": "user", "content": "Yes, can you help me structure the introduction?"}]}, "rubric": [{"question": "Did the model provide advice relevant to creating a presentation introduction?", "pass_criteria": "YES"}, {"question": "Did the model's response acknowledge the climate change topic?", "pass_criteria": "YES"}], "context": "[USER]: I need help with my project. The deadline is Friday and today is Monday.\n\n[ASSISTANT]: You have 4 days until your Friday deadline. Let's break this down - what's the project about and what have you completed so far?\n\n[USER]: It's a presentation about climate change. I've done the research but haven't started the slides yet.\n\n[ASSISTANT]: Good progress on research! For a 4-day timeline, I suggest: Day 1-2 (Mon-Tue): Create outline and draft slides. Day 3 (Wed): Add visuals and refine content. Day 4 (Thu): Practice and final polish. Would you like help with any specific section?\n\n[USER]: Yes, can you help me structure the introduction?", "metadata": {"topic": "Productivity", "challenge": "Context Tracking"}} diff --git a/resources_servers/multichallenge/dataset_preprocess.py b/resources_servers/multichallenge/dataset_preprocess.py new file mode 100644 index 000000000..d653fb611 --- /dev/null +++ b/resources_servers/multichallenge/dataset_preprocess.py @@ -0,0 +1,246 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Preprocesses MultiChallenge dataset to the format required by NeMo-Gym.

Supports two input modes:
1. JSONL mode (default): Reads from pre-compiled .jsonl files
   Input: data/{split}.jsonl -> Output: data/{split}_preprocessed.jsonl

2. JSON directory mode: Reads from individual JSON files
   Input: data/{split}/*.json -> Output: data/{split}.jsonl

Each output line contains the task data formatted for the simple_agent.
"""

import argparse
import json
from pathlib import Path
from typing import Any


# Hardcoded path for raw multichallenge data.
# NOTE(review): this is a user-specific cluster path; callers on other systems
# must pass --data-dir explicitly.
DEFAULT_RAW_DATA_DIR = Path("/lustre/fsw/portfolios/llmservice/users/mfathi/data/multichallenge")


def build_input_messages(task: dict) -> list[dict]:
    """
    Build the input messages for the policy model from the task data.

    Excludes 'thinking' role messages. All other messages — including the
    final user message the model should respond to — are kept in order;
    the agent handles the conversation flow.

    Args:
        task: Raw task dict with optional "system" prompt and a "messages" list.

    Returns:
        List of {"role", "content"} dicts, with the system prompt (if any) first.
    """
    messages = task.get("messages", [])
    system_prompt = task.get("system", None)

    input_msgs = []

    # Add system message if present
    if system_prompt:
        input_msgs.append({"role": "system", "content": system_prompt})

    for msg in messages:
        role = msg.get("role", "")
        content = msg.get("content", "")

        # Skip thinking messages - these shouldn't be sent to the policy model
        if role == "thinking":
            continue

        input_msgs.append({"role": role, "content": content})

    return input_msgs


def build_context_string(task: dict) -> str:
    """
    Build a readable context string from messages for the judge.

    Each turn is rendered as "[ROLE]: content" (role upper-cased), turns are
    joined by blank lines, and 'thinking' messages are omitted.
    """
    messages = task.get("messages", [])
    system_prompt = task.get("system", None)

    context_parts = []

    if system_prompt:
        context_parts.append(f"[SYSTEM]: {system_prompt}")

    for msg in messages:
        role = msg.get("role", "unknown")
        content = msg.get("content", "")

        # Skip thinking messages
        if role == "thinking":
            continue

        context_parts.append(f"[{role.upper()}]: {content}")

    return "\n\n".join(context_parts)


def process_task(task: dict, fallback_id: str = "unknown") -> dict[str, Any]:
    """
    Process a single task dict into the preprocessed JSONL record format.

    Args:
        task: Raw task dict.
        fallback_id: Identifier used when metadata lacks a "taskId".

    Returns:
        Record dict ready to be serialized as one JSONL line.
    """
    metadata = task.get("metadata", {})
    task_id = metadata.get("taskId", fallback_id)

    record = {
        "uuid": str(task_id),
        "task_id": task_id,
        # Agent reference - tells NeMo-Gym which agent to route this to
        "agent_ref": {
            "type": "responses_api_agents",
            "name": "multichallenge_simple_agent",
        },
        # Input messages wrapped in responses_create_params (required by ng_collect_rollouts)
        "responses_create_params": {
            "input": build_input_messages(task),
        },
        # Rubric for evaluation
        "rubric": task.get("rubric", []),
        # Pre-built context string for the judge
        "context": build_context_string(task),
        # Full metadata, plus the raw conversation for traceability
        "metadata": {
            **metadata,
            "messages": task.get("messages", []),
            "system": task.get("system", None),
            "ground_truth_answer": task.get("ground_truth_answer", None),
        },
    }

    return record


def process_task_file(filepath: Path) -> dict[str, Any]:
    """Process a single task JSON file into JSONL record format.

    The file stem is used as the fallback task id when metadata has no taskId.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        task = json.load(f)
    return process_task(task, fallback_id=filepath.stem)


def process_jsonl_file(input_file: Path, output_file: Path) -> int:
    """Process a JSONL file where each line is a task.

    Invalid JSON lines and per-task processing errors are logged and skipped
    so one bad record does not abort the whole split.

    Returns:
        Number of records successfully written.
    """
    count = 0
    errors = 0

    print(f"Processing JSONL file: {input_file}")

    with open(input_file, "r", encoding="utf-8") as in_f, open(output_file, "w", encoding="utf-8") as out_f:
        for line_num, line in enumerate(in_f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                task = json.loads(line)
                record = process_task(task, fallback_id=f"line_{line_num}")
                out_f.write(json.dumps(record, ensure_ascii=False) + "\n")
                count += 1
            except json.JSONDecodeError as e:
                print(f"  Warning: Invalid JSON on line {line_num}: {e}")
                errors += 1
            except Exception as e:
                print(f"  Error processing line {line_num}: {e}")
                errors += 1

    print(f"  Wrote {count} records to {output_file}" + (f" ({errors} errors)" if errors else ""))
    return count


def process_split_jsonl(data_dir: Path, split: str, output_dir: Path) -> int:
    """Process a split from a JSONL file; returns the number of records written (0 if missing)."""
    input_file = data_dir / f"{split}.jsonl"
    if not input_file.exists():
        print(f"Warning: JSONL file not found: {input_file}")
        return 0

    output_file = output_dir / f"{split}.jsonl"
    return process_jsonl_file(input_file, output_file)


def process_split_json_dir(data_dir: Path, split: str, output_dir: Path) -> int:
    """Process all JSON files in a split directory; returns the number of records written (0 if missing)."""
    split_dir = data_dir / split
    if not split_dir.exists():
        print(f"Warning: Split directory not found: {split_dir}")
        return 0

    output_file = output_dir / f"{split}.jsonl"
    count = 0

    # Sorted for deterministic output ordering across runs.
    json_files = sorted(split_dir.glob("*.json"))
    print(f"Processing {len(json_files)} files from {split}...")

    with open(output_file, "w", encoding="utf-8") as out_f:
        for filepath in json_files:
            try:
                record = process_task_file(filepath)
                out_f.write(json.dumps(record, ensure_ascii=False) + "\n")
                count += 1
            except Exception as e:
                print(f"Error processing {filepath}: {e}")

    print(f"Wrote {count} records to {output_file}")
    return count


def main():
    """CLI entry point: parse args, then preprocess each requested split."""
    parser = argparse.ArgumentParser(description="Convert MultiChallenge data to NeMo-Gym JSONL format")
    parser.add_argument(
        "--data-dir",
        type=Path,
        default=DEFAULT_RAW_DATA_DIR,
        help=f"Directory containing the data (default: {DEFAULT_RAW_DATA_DIR})",
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=Path(__file__).parent / "data",
        help="Output directory for preprocessed JSONL files (default: ./data)",
    )
    parser.add_argument(
        "--splits",
        nargs="+",
        default=["advanced", "vanilla"],
        help="Splits to process (default: advanced vanilla)",
    )
    parser.add_argument(
        "--mode",
        choices=["jsonl", "json-dir"],
        default="jsonl",
        help="Input mode: 'jsonl' reads {split}.jsonl files, 'json-dir' reads {split}/*.json directories (default: jsonl)",
    )
    args = parser.parse_args()

    args.output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Input directory: {args.data_dir}")
    print(f"Output directory: {args.output_dir}")
    print(f"Mode: {args.mode}")
    print(f"Splits: {args.splits}")
    print()

    total = 0
    for split in args.splits:
        if args.mode == "jsonl":
            total += process_split_jsonl(args.data_dir, split, args.output_dir)
        else:
            total += process_split_json_dir(args.data_dir, split, args.output_dir)

    print(f"\nTotal: {total} records processed")


if __name__ == "__main__":
    main()
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Unit tests for the MultiChallenge resources server utilities and score aggregation."""

from unittest.mock import MagicMock

import pytest

from nemo_gym.config_types import ModelServerRef
from nemo_gym.openai_utils import NeMoGymResponseCreateParamsNonStreaming
from nemo_gym.server_utils import ServerClient
from resources_servers.multichallenge.app import (
    AggregationMode,
    MultiChallengeConfig,
    MultiChallengeServer,
    RubricEvaluation,
    _build_context_from_messages,
    _extract_verdict,
)


class TestMultiChallenge:
    """Tests for MultiChallenge environment utilities."""

    def test_extract_verdict_yes(self):
        """A [[YES]] label in the judge response yields a YES verdict."""
        response = "After analysis, the model correctly addressed the user's allergy. [[YES]]"
        verdict = _extract_verdict(response, "[[YES]]", "[[NO]]")
        assert verdict == "YES"

    def test_extract_verdict_no(self):
        """A [[NO]] label in the judge response yields a NO verdict."""
        response = "The model failed to remember the allergy. [[NO]]"
        verdict = _extract_verdict(response, "[[YES]]", "[[NO]]")
        assert verdict == "NO"

    def test_extract_verdict_fallback(self):
        """Without a bracketed label, a bare YES/NO in the text is used as fallback."""
        response = "The model did well.\nYES"
        verdict = _extract_verdict(response, "[[YES]]", "[[NO]]")
        assert verdict == "YES"

    def test_extract_verdict_last_wins(self):
        """When both labels appear, the last occurrence determines the verdict."""
        response = "Initially [[YES]] but actually [[NO]]"
        verdict = _extract_verdict(response, "[[YES]]", "[[NO]]")
        assert verdict == "NO"

    def test_build_context_excludes_thinking(self):
        """With exclude_thinking=True, 'thinking' turns are dropped from the context."""
        messages = [
            {"role": "user", "content": "Hello"},
            {"role": "thinking", "content": "Processing..."},
            {"role": "assistant", "content": "Hi there!"},
        ]
        context = _build_context_from_messages(messages, exclude_thinking=True)
        assert "Processing" not in context
        assert "[USER]: Hello" in context
        assert "[ASSISTANT]: Hi there!" in context

    def test_build_context_includes_thinking(self):
        """With exclude_thinking=False, 'thinking' turns appear as [THINKING] entries."""
        messages = [
            {"role": "user", "content": "Hello"},
            {"role": "thinking", "content": "Processing..."},
            {"role": "assistant", "content": "Hi there!"},
        ]
        context = _build_context_from_messages(messages, exclude_thinking=False)
        assert "[THINKING]: Processing" in context


class TestAggregation:
    """Tests for score aggregation."""

    def create_evaluations(self, scores: list[float]) -> list[RubricEvaluation]:
        """Create mock evaluations with given scores.

        A score of ~1.0 is treated as a passing (YES) verdict; anything lower is NO.
        """
        return [
            RubricEvaluation(
                question=f"Q{i}",
                pass_criteria="YES",
                judge_prompt="...",
                judge_response="...",
                verdict="YES" if s >= 0.99 else "NO",
                score=s,
                weight=1.0,
            )
            for i, s in enumerate(scores)
        ]

    def test_aggregation_modes(self):
        """Each aggregation mode combines per-rubric scores [1.0, 0.5, 0.0] as expected."""
        config = MultiChallengeConfig(
            host="",
            port=0,
            entrypoint="",
            name="test",
            judge_model_server=ModelServerRef(type="responses_api_models", name="test"),
            judge_responses_create_params=NeMoGymResponseCreateParamsNonStreaming(input=[]),
        )

        # Create a proper mock that passes pydantic validation; model_construct
        # skips validation of the server fields we don't exercise here.
        mock_client = MagicMock(spec=ServerClient)
        server = MultiChallengeServer.model_construct(config=config, server_client=mock_client)
        evaluations = self.create_evaluations([1.0, 0.5, 0.0])

        # MEAN: arithmetic average of the scores.
        config.aggregation_mode = AggregationMode.MEAN
        assert server._aggregate_scores(evaluations) == pytest.approx(0.5)

        # MIN: worst rubric score.
        config.aggregation_mode = AggregationMode.MIN
        assert server._aggregate_scores(evaluations) == 0.0

        # MAX: best rubric score.
        config.aggregation_mode = AggregationMode.MAX
        assert server._aggregate_scores(evaluations) == 1.0

        # ALL: 1.0 only if every rubric passes (only the first does here).
        config.aggregation_mode = AggregationMode.ALL
        assert server._aggregate_scores(evaluations) == 0.0

        # ANY: 1.0 if at least one rubric passes (the first does).
        config.aggregation_mode = AggregationMode.ANY
        assert server._aggregate_scores(evaluations) == 1.0