-
Notifications
You must be signed in to change notification settings - Fork 163
support structured outputs in hle judge for optional AA compatibility #1186
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
0379e3c
c7a5c6a
493a793
ff96906
f86bc95
54c8bc0
534a6c0
adcff37
8442962
8509918
ba33ee1
cf9725b
52d59b4
6539eae
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,39 @@ | ||
| # Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| import json | ||
| import logging | ||
|
|
||
| from nemo_skills.evaluation.metrics.math_metrics import MathMetrics | ||
| from nemo_skills.utils import get_logger_name | ||
|
|
||
| LOG = logging.getLogger(get_logger_name(__file__)) | ||
|
|
||
|
|
||
class HLEAAMetrics(MathMetrics):
    """Metrics for HLE with judge structured output for AA-compatibility.

    The AA-compatible judge emits its verdict as a JSON object (the
    ``HLE_JUDGE_AA`` structured output schema) instead of the plain
    ``Judgement: ...`` string that ``MathMetrics`` expects. This subclass
    rewrites each prediction's ``judgement`` field into that plain format
    before delegating to the parent metric computation.
    """

    def _postprocess_judgement(self, prediction: dict) -> dict:
        """Return a copy of *prediction* with a MathMetrics-compatible judgement.

        The structured judgement is expected to be a JSON string containing a
        ``correct`` key ("yes"/"no"). On any parse failure the judgement is
        replaced with a sentinel value so the sample counts as incorrect
        instead of crashing the whole metric computation.
        """
        prediction = prediction.copy()
        try:
            judgement = json.loads(prediction["judgement"])
            prediction["judgement"] = "Judgement: {}".format(judgement["correct"])
        except (json.JSONDecodeError, KeyError, TypeError) as e:
            # TypeError: judgement is None or already-decoded (not str/bytes);
            # previously this escaped the handler and crashed metrics.
            LOG.debug(f"Failed to parse structured output judgement: {e}")
            prediction["judgement"] = "Judgement: FAILED_TO_POSTPROCESS"
        return prediction

    def update(self, predictions):
        """Postprocess all judgements, then update the parent metrics."""
        # Renamed local from "preprocessed" to match _postprocess_judgement.
        postprocessed_predictions = [self._postprocess_judgement(pred) for pred in predictions]
        super().update(postprocessed_predictions)
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,29 @@ | ||||||
| # Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. | ||||||
| # | ||||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||||||
| # you may not use this file except in compliance with the License. | ||||||
| # You may obtain a copy of the License at | ||||||
| # | ||||||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||||||
| # | ||||||
| # Unless required by applicable law or agreed to in writing, software | ||||||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||||
| # See the License for the specific language governing permissions and | ||||||
| # limitations under the License. | ||||||
|
|
||||||
| from typing import Literal | ||||||
|
|
||||||
| from pydantic import BaseModel | ||||||
anowaczynski-nvidia marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
|
|
||||||
|
|
||||||
class HLEJudgeAAResponseFormat(BaseModel):
    """Structured output schema for the AA-compatible HLE judge response.

    The judge is constrained to emit a JSON object with exactly these
    fields, which downstream metrics parse via ``json.loads``.
    """

    # Final answer the judge extracted from the model's response.
    extracted_final_answer: str
    # Judge's free-form explanation of its verdict.
    reasoning: str
    # Binary verdict, constrained so metrics can rely on exact values.
    correct: Literal["yes", "no"]
    # Judge's self-reported confidence — presumably 0-100; TODO confirm.
    confidence: int
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
Note: If this suggestion doesn't match your team's coding style, reply to this and let me know. I'll remember it for next time! |
||||||
|
|
||||||
|
|
||||||
# Registry mapping a ``++structured_output`` config value to its pydantic schema.
# NOTE(review): callers appear to index this dict directly with a user-provided
# config value; an unknown key would surface as a bare KeyError — consider
# validating the value and listing the allowed keys in the error message.
STRUCTURED_OUTPUTS = {
    "HLE_JUDGE_AA": HLEJudgeAAResponseFormat,
}
anowaczynski-nvidia marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -192,3 +192,32 @@ def test_server_metadata_from_num_tasks(tmp_path): | |
| assert server_cmd.script.num_gpus == server_config["num_gpus"] | ||
| assert groups[0].hardware.num_gpus == server_config["num_gpus"] | ||
| assert groups[0].hardware.num_tasks == server_cmd.script.num_tasks | ||
|
|
||
|
|
||
def test_judge_generations_with_structured_output(tmp_path):
    """End-to-end check of HLE eval with an AA-compatible structured-output judge.

    Runs ``ns eval`` with ``++structured_output=HLE_JUDGE_AA`` and verifies
    every judgement in the output file is valid JSON with the expected schema.

    NOTE(review): networked integration test — it calls
    https://integrate.api.nvidia.com, so it needs network access and valid API
    credentials; consider gating it with a pytest marker in CI.
    """
    # Argument list with shell=False avoids shell-quoting pitfalls entirely.
    cmd = [
        "ns", "eval",
        "--server_type=openai",
        "--model=nvidia/nemotron-3-nano-30b-a3b",
        "--server_address=https://integrate.api.nvidia.com/v1",
        "--benchmarks=hle",
        f"--output_dir={tmp_path}",
        "--judge_model=nvidia/nemotron-3-nano-30b-a3b",
        "--judge_server_address=https://integrate.api.nvidia.com/v1",
        "--judge_server_type=openai",
        "--metric_type=hle-aa",
        "--extra_judge_args=++structured_output=HLE_JUDGE_AA",
        "++max_samples=2",
        "++inference.tokens_to_generate=1024",  # to make test go fast
    ]
    subprocess.run(cmd, check=True)

    # Checking that output exists and every judgement has the expected format.
    with open(f"{tmp_path}/eval-results/hle/output.jsonl") as fin:
        data = [json.loads(line) for line in fin]
    assert len(data) == 2  # ++max_samples=2 above
    expected_keys = {"extracted_final_answer", "reasoning", "correct", "confidence"}
    for sample in data:
        judgement = json.loads(sample["judgement"])
        assert set(judgement) == expected_keys
        assert judgement["correct"] in {"yes", "no"}
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
Unhandled invalid key
When `structured_output` is set to any non-None value that is not present in `STRUCTURED_OUTPUTS`, `process_single_datapoint` will throw a `KeyError` at `STRUCTURED_OUTPUTS[self.cfg.structured_output]`. Since this is a user-provided config value (Hydra/CLI via `++structured_output=...`), this becomes an unhelpful crash path. Consider validating `structured_output` in `GenerationTaskConfig.__post_init__` (or using `.get()` with an explicit `ValueError` listing allowed keys) so users get a clear error message.