-
Notifications
You must be signed in to change notification settings - Fork 163
support structured outputs in hle judge for optional AA compatibility #1186
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
0379e3c
c7a5c6a
493a793
ff96906
f86bc95
54c8bc0
534a6c0
adcff37
8442962
8509918
ba33ee1
cf9725b
52d59b4
6539eae
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,39 @@ | ||
| # Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| import json | ||
| import logging | ||
|
|
||
| from nemo_skills.evaluation.metrics.math_metrics import MathMetrics | ||
| from nemo_skills.utils import get_logger_name | ||
|
|
||
| LOG = logging.getLogger(get_logger_name(__file__)) | ||
|
|
||
|
|
||
class HLEAAMetrics(MathMetrics):
    """Metrics for HLE with judge structured output for AA-compatibility.

    The AA-compatible judge emits its verdict as a JSON object (the
    ``HLE_JUDGE_AA`` structured output schema) instead of the plain
    ``Judgement: ...`` string that ``MathMetrics`` expects. This subclass
    rewrites each prediction's ``judgement`` field into that plain format
    before delegating to the parent metric computation.
    """

    def _postprocess_judgement(self, prediction: dict) -> dict:
        """Return a copy of *prediction* with a MathMetrics-compatible judgement.

        The structured judgement is expected to be a JSON string containing a
        ``correct`` key ("yes"/"no"). On any parse failure the judgement is
        replaced with a sentinel value so the sample counts as incorrect
        instead of crashing the whole metric computation.
        """
        prediction = prediction.copy()
        try:
            judgement = json.loads(prediction["judgement"])
            prediction["judgement"] = "Judgement: {}".format(judgement["correct"])
        except (json.JSONDecodeError, KeyError, TypeError) as e:
            # TypeError: judgement is None or already-decoded (not str/bytes);
            # previously this escaped the handler and crashed metrics.
            LOG.debug(f"Failed to parse structured output judgement: {e}")
            prediction["judgement"] = "Judgement: FAILED_TO_POSTPROCESS"
        return prediction

    def update(self, predictions):
        """Postprocess all judgements, then update the parent metrics."""
        # Renamed local from "preprocessed" to match _postprocess_judgement.
        postprocessed_predictions = [self._postprocess_judgement(pred) for pred in predictions]
        super().update(postprocessed_predictions)
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,29 @@ | ||||||
| # Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. | ||||||
| # | ||||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||||||
| # you may not use this file except in compliance with the License. | ||||||
| # You may obtain a copy of the License at | ||||||
| # | ||||||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||||||
| # | ||||||
| # Unless required by applicable law or agreed to in writing, software | ||||||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||||
| # See the License for the specific language governing permissions and | ||||||
| # limitations under the License. | ||||||
|
|
||||||
| from typing import Literal | ||||||
|
|
||||||
| from pydantic import BaseModel | ||||||
anowaczynski-nvidia marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
|
|
||||||
|
|
||||||
class HLEJudgeAAResponseFormat(BaseModel):
    """Structured output schema for the AA-compatible HLE judge response.

    The judge is constrained to emit a JSON object with exactly these
    fields, which downstream metrics parse via ``json.loads``.
    """

    # Final answer the judge extracted from the model's response.
    extracted_final_answer: str
    # Judge's free-form explanation of its verdict.
    reasoning: str
    # Binary verdict, constrained so metrics can rely on exact values.
    correct: Literal["yes", "no"]
    # Judge's self-reported confidence — presumably 0-100; TODO confirm.
    confidence: int
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
Note: If this suggestion doesn't match your team's coding style, reply to this and let me know. I'll remember it for next time! |
||||||
|
|
||||||
|
|
||||||
# Registry mapping a ``++structured_output`` config value to its pydantic schema.
# NOTE(review): callers appear to index this dict directly with a user-provided
# config value; an unknown key would surface as a bare KeyError — consider
# validating the value and listing the allowed keys in the error message.
STRUCTURED_OUTPUTS = {
    "HLE_JUDGE_AA": HLEJudgeAAResponseFormat,
}
anowaczynski-nvidia marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -192,3 +192,32 @@ def test_server_metadata_from_num_tasks(tmp_path): | |
| assert server_cmd.script.num_gpus == server_config["num_gpus"] | ||
| assert groups[0].hardware.num_gpus == server_config["num_gpus"] | ||
| assert groups[0].hardware.num_tasks == server_cmd.script.num_tasks | ||
|
|
||
|
|
||
def test_judge_generations_with_structured_output(tmp_path):
    """End-to-end check of HLE eval with an AA-compatible structured-output judge.

    Runs ``ns eval`` with ``++structured_output=HLE_JUDGE_AA`` and verifies
    every judgement in the output file is valid JSON with the expected schema.

    NOTE(review): networked integration test — it calls
    https://integrate.api.nvidia.com, so it needs network access and valid API
    credentials; consider gating it with a pytest marker in CI.
    """
    # Argument list with shell=False avoids shell-quoting pitfalls entirely.
    cmd = [
        "ns", "eval",
        "--server_type=openai",
        "--model=nvidia/nemotron-3-nano-30b-a3b",
        "--server_address=https://integrate.api.nvidia.com/v1",
        "--benchmarks=hle",
        f"--output_dir={tmp_path}",
        "--judge_model=nvidia/nemotron-3-nano-30b-a3b",
        "--judge_server_address=https://integrate.api.nvidia.com/v1",
        "--judge_server_type=openai",
        "--metric_type=hle-aa",
        "--extra_judge_args=++structured_output=HLE_JUDGE_AA",
        "++max_samples=2",
        "++inference.tokens_to_generate=1024",  # to make test go fast
    ]
    subprocess.run(cmd, check=True)

    # Checking that output exists and every judgement has the expected format.
    with open(f"{tmp_path}/eval-results/hle/output.jsonl") as fin:
        data = [json.loads(line) for line in fin]
    assert len(data) == 2  # ++max_samples=2 above
    expected_keys = {"extracted_final_answer", "reasoning", "correct", "confidence"}
    for sample in data:
        judgement = json.loads(sample["judgement"])
        assert set(judgement) == expected_keys
        assert judgement["correct"] in {"yes", "no"}
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
Unhandled invalid key
When `structured_output` is set to any non-None value that is not present in `STRUCTURED_OUTPUTS`, `process_single_datapoint` will throw a `KeyError` at `STRUCTURED_OUTPUTS[self.cfg.structured_output]`. Since this is a user-provided config value (Hydra/CLI via `++structured_output=...`), this becomes an unhelpful crash path. Consider validating `structured_output` in `GenerationTaskConfig.__post_init__` (or using `.get()` with an explicit `ValueError` listing allowed keys) so users get a clear error message.