diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 3c6cd2e0c7..225c5f9b00 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -91,5 +91,5 @@ jobs: docker run --rm --network=host -v /nemo_run:/nemo_run nemo-skills-sandbox-image & sleep 10 set -o pipefail # this will make sure next line returns non-0 exit code if tests fail - ns prepare_data gsm8k math-500 + ns prepare_data gsm8k math-500 hle python -m pytest tests/ -m "not gpu" --junitxml=pytest.xml --cov-report=term-missing:skip-covered --cov=nemo_skills --cov=pipeline --durations=30 -rs -s -vvv diff --git a/nemo_skills/evaluation/metrics/hleaa_metrics.py b/nemo_skills/evaluation/metrics/hleaa_metrics.py new file mode 100644 index 0000000000..386f3e64da --- /dev/null +++ b/nemo_skills/evaluation/metrics/hleaa_metrics.py @@ -0,0 +1,39 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json +import logging + +from nemo_skills.evaluation.metrics.math_metrics import MathMetrics +from nemo_skills.utils import get_logger_name + +LOG = logging.getLogger(get_logger_name(__file__)) + + +class HLEAAMetrics(MathMetrics): + """Metrics for HLE with judge structured output for AA-compatibility.""" + + def _postprocess_judgement(self, prediction: dict) -> dict: + prediction = prediction.copy() + try: + judgement = json.loads(prediction["judgement"]) + prediction["judgement"] = "Judgement: {}".format(judgement["correct"]) + except (json.JSONDecodeError, KeyError) as e: + LOG.debug(f"Failed to parse structured output judgement: {e}") + prediction["judgement"] = "Judgement: FAILED_TO_POSTPROCESS" + return prediction + + def update(self, predictions): + preprocessed_predictions = [self._postprocess_judgement(pred) for pred in predictions] + super().update(preprocessed_predictions) diff --git a/nemo_skills/evaluation/metrics/map_metrics.py b/nemo_skills/evaluation/metrics/map_metrics.py index 41c2acab64..ad3e367d0d 100644 --- a/nemo_skills/evaluation/metrics/map_metrics.py +++ b/nemo_skills/evaluation/metrics/map_metrics.py @@ -32,6 +32,7 @@ SweBenchMetrics, ) from nemo_skills.evaluation.metrics.gradingbench_metrics import GradingBenchMetrics +from nemo_skills.evaluation.metrics.hleaa_metrics import HLEAAMetrics from nemo_skills.evaluation.metrics.icpc_metrics import ICPCMetrics from nemo_skills.evaluation.metrics.if_metrics import IFMetrics from nemo_skills.evaluation.metrics.ioi_metrics import IOIMetrics @@ -47,6 +48,7 @@ METRICS_MAP = { "math": MathMetrics, "hle": functools.partial(MathMetrics, compute_no_answer=False, answer_key="generation"), + "hle-aa": functools.partial(HLEAAMetrics, compute_no_answer=False, answer_key="generation"), "frontierscience-olympiad": functools.partial( MathMetrics, compute_no_answer=False, question_key="question", answer_key="generation" ), diff --git a/nemo_skills/inference/generate.py 
b/nemo_skills/inference/generate.py index bc5d2c38d5..b3f0094c1a 100644 --- a/nemo_skills/inference/generate.py +++ b/nemo_skills/inference/generate.py @@ -47,6 +47,7 @@ server_params, ) from nemo_skills.inference.model.base import EndpointType +from nemo_skills.inference.structured_outputs import STRUCTURED_OUTPUTS from nemo_skills.prompt.utils import get_prompt, get_token_count from nemo_skills.utils import ( chunk_data, @@ -221,6 +222,8 @@ class GenerationTaskConfig: eval_type: str | None = None # "lean4-proof", "math", etc. eval_config: dict = field(default_factory=dict) # Config for the evaluator + structured_output: str | None = None + def __post_init__(self): self._post_init_validate_data() self._post_init_validate_server() @@ -688,6 +691,9 @@ async def process_single_datapoint(self, data_point, all_data): "stop_phrases": [self.cfg.stop_phrase] if self.cfg.stop_phrase else None, } + if self.cfg.structured_output is not None: + generation_params["response_format"] = STRUCTURED_OUTPUTS[self.cfg.structured_output] + if self.cfg.code_execution: if self.cfg.override_max_code_executions and self.cfg.total_code_executions_in_prompt is not None: generation_params["max_code_executions"] = data_point["total_code_executions"] diff --git a/nemo_skills/inference/model/base.py b/nemo_skills/inference/model/base.py index 81398a6587..f705e7e06f 100644 --- a/nemo_skills/inference/model/base.py +++ b/nemo_skills/inference/model/base.py @@ -242,6 +242,7 @@ async def generate_async( tools: list[dict] | None = None, include_response: bool = False, extra_body: dict = None, + response_format=None, ) -> dict: if endpoint_type is None: # Infering completion type from prompt @@ -267,6 +268,7 @@ async def generate_async( "reasoning_effort": reasoning_effort, "tools": tools, "extra_body": extra_body, + "response_format": response_format, } # TODO: remove this after we no longer use gpt-oss or it's fixed in vllm diff --git a/nemo_skills/inference/model/gemini.py 
b/nemo_skills/inference/model/gemini.py index 44764cd833..84c79f742a 100644 --- a/nemo_skills/inference/model/gemini.py +++ b/nemo_skills/inference/model/gemini.py @@ -57,6 +57,7 @@ def _build_chat_request_params( reasoning_effort: str | None, extra_body: dict = None, tools: list[dict] | None = None, + response_format=None, ) -> dict: """ https://github.com/BerriAI/litellm/blob/v1.75.0-nightly/litellm/constants.py#L45-L56 @@ -72,6 +73,8 @@ def _build_chat_request_params( "`repetition_penalty` is not supported by Gemini API, please set it to default value `1.0`." ) assert not extra_body, "`extra_body` is not supported by Gemini API, please set it to None or empty dict" + if response_format is not None: + raise NotImplementedError("`response_format` is not supported by Gemini API, please set it to None") # Vertext AI params: https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference # litellm default params: https://github.com/BerriAI/litellm/blob/v1.75.0-nightly/litellm/llms/gemini/chat/transformation.py#L73-L90 diff --git a/nemo_skills/inference/model/megatron.py b/nemo_skills/inference/model/megatron.py index 9a95c9158e..cc904a14a8 100644 --- a/nemo_skills/inference/model/megatron.py +++ b/nemo_skills/inference/model/megatron.py @@ -36,6 +36,7 @@ def _build_chat_request_params( stop_phrases: list[str] | None = None, timeout: int | None = None, top_logprobs: int | None = None, + response_format=None, **kwargs, ) -> dict: # Validations @@ -48,6 +49,7 @@ def _build_chat_request_params( if top_k != -1: raise NotImplementedError("Megatron server does not support top_k parameter.") assert kwargs.get("tools") is None, "Megatron server does not support tools parameter." + assert response_format is None, "Megatron server does not support response_format parameter." 
params = { "messages": messages, @@ -81,6 +83,7 @@ def _build_completion_request_params( stop_phrases: list[str] | None = None, timeout: int | None = None, top_logprobs: int | None = None, + response_format=None, **kwargs, ) -> dict: # Parameter validation specific to Megatron @@ -93,6 +96,7 @@ def _build_completion_request_params( if top_k != -1: raise NotImplementedError("Megatron server does not support top_k parameter.") assert kwargs.get("tools") is None, "Megatron server does not support tools parameter." + assert response_format is None, "Megatron server does not support response_format parameter." return { "prompt": prompt, diff --git a/nemo_skills/inference/model/openai.py b/nemo_skills/inference/model/openai.py index 6a81ab86a4..a4ec28bf13 100644 --- a/nemo_skills/inference/model/openai.py +++ b/nemo_skills/inference/model/openai.py @@ -69,6 +69,7 @@ def _build_completion_request_params(self, **kwargs) -> dict: assert kwargs.pop("reasoning_effort", None) is None, ( "reasoning_effort is not supported by completion requests." ) + assert kwargs.pop("response_format", None) is None, "response_format is not supported by completion requests." assert kwargs.pop("top_k", -1) == -1, "`top_k` is not supported by OpenAI API, please set it to -1." assert kwargs.pop("min_p", 0.0) == 0.0, "`min_p` is not supported by OpenAI API, please set it to 0.0." 
assert kwargs.pop("repetition_penalty", 1.0) == 1.0, ( @@ -100,6 +101,7 @@ def _build_chat_request_params( reasoning_effort: str | None, extra_body: dict = None, tools: list[dict] | None = None, + response_format=None, ) -> dict: # Validations if top_k != -1: @@ -118,6 +120,7 @@ def _build_chat_request_params( "timeout": timeout, "stream": stream, "tools": tools, + "response_format": response_format, } if self._is_reasoning_model(self.model): diff --git a/nemo_skills/inference/model/sglang.py b/nemo_skills/inference/model/sglang.py index b4cc7f6564..10550dfed2 100644 --- a/nemo_skills/inference/model/sglang.py +++ b/nemo_skills/inference/model/sglang.py @@ -39,6 +39,7 @@ def _build_chat_request_params( reasoning_effort: str | None = None, tools: list[dict] | None = None, extra_body: dict = None, + response_format=None, ) -> dict: request = super()._build_chat_request_params( messages=messages, @@ -56,6 +57,7 @@ def _build_chat_request_params( reasoning_effort=reasoning_effort, tools=tools, extra_body=extra_body, + response_format=response_format, ) # SGLang requires tool_choice in the request body when tools are provided if tools is not None: diff --git a/nemo_skills/inference/model/vllm.py b/nemo_skills/inference/model/vllm.py index 015e22066a..0b62812568 100644 --- a/nemo_skills/inference/model/vllm.py +++ b/nemo_skills/inference/model/vllm.py @@ -143,9 +143,11 @@ def _build_completion_request_params( reasoning_effort: str | None = None, extra_body: dict = None, tools: list[dict] | None = None, + response_format=None, ) -> dict: assert reasoning_effort is None, "reasoning_effort is not supported for text completion requests" assert tools is None, "tools are not supported for text completion requests" + assert response_format is None, "response_format is not supported for text completion requests" return { "prompt": prompt, "max_tokens": tokens_to_generate, @@ -182,6 +184,7 @@ def _build_chat_request_params( reasoning_effort: str | None = None, tools: list[dict] | 
None = None, extra_body: dict = None, + response_format=None, ) -> dict: # Process messages to handle image content (VLM support) processed_messages = [] @@ -207,6 +210,7 @@ def _build_chat_request_params( "timeout": timeout, "extra_body": self._build_request_body(top_k, min_p, repetition_penalty, extra_body=extra_body), "tools": tools, + "response_format": response_format, } if reasoning_effort: request["allowed_openai_params"] = ["reasoning_effort"] diff --git a/nemo_skills/inference/structured_outputs.py b/nemo_skills/inference/structured_outputs.py new file mode 100644 index 0000000000..2683819f33 --- /dev/null +++ b/nemo_skills/inference/structured_outputs.py @@ -0,0 +1,29 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Literal + +from pydantic import BaseModel + + +class HLEJudgeAAResponseFormat(BaseModel): + extracted_final_answer: str + reasoning: str + correct: Literal["yes", "no"] + confidence: int + + +STRUCTURED_OUTPUTS = { + "HLE_JUDGE_AA": HLEJudgeAAResponseFormat, +} diff --git a/nemo_skills/pipeline/eval.py b/nemo_skills/pipeline/eval.py index fac2f884bd..1f1557a4f9 100644 --- a/nemo_skills/pipeline/eval.py +++ b/nemo_skills/pipeline/eval.py @@ -17,7 +17,7 @@ from collections import defaultdict from copy import deepcopy from pathlib import Path -from typing import List +from typing import List, Optional import typer @@ -458,6 +458,10 @@ def eval( "", help="Additional sbatch kwargs to pass to the job scheduler. Values should be provided as a JSON string or as a `dict` if invoking from code.", ), + metric_type: Optional[str] = typer.Option( + None, + help="Specify metric type to use a specific metric calculator.", + ), metrics_kwargs: str = typer.Option( "", help="Additional kwargs to pass to the metrics calculator. 
Values should be provided as a JSON string or as a `dict` if invoking from code.", ) @@ -773,6 +777,8 @@ def eval( command += f" --wandb_project={wandb_project} " if data_dir: command += f" --data_dir={data_dir} " + if metric_type: + command += f" --metric_type={metric_type} " if metrics_kwargs: command += f" --metrics_kwargs='{kwargs_to_string(metrics_kwargs)}' " diff --git a/tests/test_generation.py b/tests/test_generation.py index 4298c738db..8245cfe55b 100644 --- a/tests/test_generation.py +++ b/tests/test_generation.py @@ -192,3 +192,32 @@ def test_server_metadata_from_num_tasks(tmp_path): assert server_cmd.script.num_gpus == server_config["num_gpus"] assert groups[0].hardware.num_gpus == server_config["num_gpus"] assert groups[0].hardware.num_tasks == server_cmd.script.num_tasks + + +def test_judge_generations_with_structured_output(tmp_path): + cmd = ( + f"ns eval " + f" --server_type=openai " + f" --model=nvidia/nemotron-3-nano-30b-a3b " + f" --server_address=https://integrate.api.nvidia.com/v1 " + f" --benchmarks=hle " + f" --output_dir={tmp_path} " + f" --judge_model=nvidia/nemotron-3-nano-30b-a3b " + f" --judge_server_address=https://integrate.api.nvidia.com/v1 " + f" --judge_server_type=openai " + f" --metric_type=hle-aa " + f' --extra_judge_args="++structured_output=HLE_JUDGE_AA" ' + f" ++max_samples=2 " + f" ++inference.tokens_to_generate=1024 " # to make test go fast + ) + subprocess.run(cmd, shell=True, check=True) + + # checking that output exists and has the expected format + with open(f"{tmp_path}/eval-results/hle/output.jsonl") as fin: + data = [json.loads(line) for line in fin] + judgements = [json.loads(entry["judgement"]) for entry in data] + expected_keys = {"extracted_final_answer", "reasoning", "correct", "confidence"} + assert set(judgements[0].keys()) == expected_keys + assert set(judgements[1].keys()) == expected_keys + assert judgements[0]["correct"] in {"yes", "no"} + assert judgements[1]["correct"] in {"yes", "no"}