Merged
4 changes: 3 additions & 1 deletion docs/evaluation/scientific-knowledge.md
@@ -6,13 +6,15 @@ Nemo-Skills can be used to evaluate an LLM on various STEM datasets.

| <div style="width:55px; display:inline-block; text-align:center">Dataset</div> | <div style="width:105px; display:inline-block; text-align:center">Questions</div> | <div style="width:85px; display:inline-block; text-align:center">Types</div> | <div style="width:145px; display:inline-block; text-align:center">Domain</div> | <div style="width:60px; display:inline-block; text-align:center">Images?</div> | <div style="width:50px; display:inline-block; text-align:center">NS default</div> |
|:---|:---:|:---:|:---|:---:|:---:|
| **[HLE](https://huggingface.co/datasets/cais/hle)** | 2500 | Open ended, MCQ | Engineering, Physics, Chemistry, Bio, etc. | Yes | text only |
| **[HLE](https://huggingface.co/datasets/cais/hle)** | 2,500 | Open ended, MCQ | Engineering, Physics, Chemistry, Bio, etc. | Yes | text only |
| **[HLE-Verified](https://huggingface.co/datasets/skylenage/HLE-Verified)** | 2,500 | Open ended, MCQ | Engineering, Physics, Chemistry, Bio, etc. | Yes | gold+revision text only |
| **[GPQA ](https://huggingface.co/datasets/Idavidrein/gpqa)** | 448 (main)<br>198 (diamond)</br>546 (ext.) | MCQ (4) | Physics, Chemistry, Biology | No | diamond |
| **[SuperGPQA](https://huggingface.co/datasets/m-a-p/SuperGPQA)** | 26,529 | MCQ (≤ 10) | Science, Eng, Humanities, etc. | No | test |
| **[MMLU-Pro](https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro)** | 12,032 | MCQ (≤ 10) | Multiple subjects | No | test |
| **[SciCode](https://huggingface.co/datasets/SciCode1/SciCode)** | 80</br>(338 subtasks) | Code gen | Scientific computing | No | test+val |
| **[FrontierScience](https://huggingface.co/datasets/openai/frontierscience)** | 100 | Short-answer | Physics, Chemistry, Biology | No | all |
| **[Physics](https://huggingface.co/datasets/desimfj/PHYSICS)** | 1,000 (EN), 1,000 (ZH) | Open-ended | Physics | No | EN |
| **[UGPhysics](https://huggingface.co/datasets/UGPhysics/ugphysics)** | 5,520 (EN), 5,520 (ZH) | Open-ended, MCQ | Physics | No | EN |
Comment on lines +10 to +17

⚠️ Potential issue | 🟠 Major

Add benchmark-specific eval examples and expected results for the new datasets.

HLE-Verified and UGPhysics were added to the overview, but this update should also include example run commands and expected tested-model outcomes for these two benchmarks.

As per coding guidelines: "When adding new benchmarks, add it to the corresponding place in the documentation with example commands for running evaluation and expected results for tested models".
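For instance, the new subsections might pair each dataset with a run command along these lines (a sketch only — the benchmark name matches this PR's dataset folder, but the remaining flags, paths, and model name are illustrative and should be checked against the repo's eval CLI documentation):

```shell
# Illustrative sketch -- verify flags against the ns eval documentation.
ns eval \
    --cluster=local \
    --benchmarks=hle_verified \
    --model=/workspace/models/example-model \
    --server_type=vllm \
    --output_dir=/workspace/eval-results/hle_verified
```

An analogous command (and a small table of baseline scores for tested models) would be added for `ugphysics`.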

🧰 Tools
🪛 markdownlint-cli2 (0.21.0)

[warning] 11-11: Spaces inside link text

(MD039, no-space-in-links)


| **[MMLU](https://huggingface.co/datasets/cais/mmlu)** | 14,042 | MCQ (4) | Multiple Subjects | No | test |
| **[MMLU-Redux](https://huggingface.co/datasets/edinburgh-dawg/mmlu-redux)** | 5,385| MCQ (4) | Multiple Subjects | No | test |
| **[SimpleQA](https://github.com/openai/simple-evals/)** | 4,326 (test), 1,000 (verified) | Open ended | Factuality, Parametric knowledge| No | verified |
27 changes: 27 additions & 0 deletions nemo_skills/dataset/hle_verified/__init__.py
@@ -0,0 +1,27 @@
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# settings that define how evaluation should be done by default (all can be changed from cmdline)
METRICS_TYPE = "hle"
GENERATION_ARGS = "++prompt_config=generic/hle ++eval_type=math"
EVAL_SPLIT = "text" # text subset of gold + revised subset of HLE-Verified (https://arxiv.org/pdf/2602.13964v3)

# Some answers are not possible to compare symbolically, so have to use a judge model
# Setting openai judge by default, but can be overriden from command line for a locally hosted model
Comment on lines +20 to +21

⚠️ Potential issue | 🟡 Minor

Minor typo: "overriden" → "overridden".

✏️ Proposed fix
 # Some answers are not possible to compare symbolically, so have to use a judge model
-# Setting openai judge by default, but can be overriden from command line for a locally hosted model
+# Setting openai judge by default, but can be overridden from command line for a locally hosted model

JUDGE_PIPELINE_ARGS = {
"model": "o3-mini-2025-01-31",
"server_type": "openai",
"server_address": "https://api.openai.com/v1",
}
JUDGE_ARGS = "++prompt_config=judge/hle ++generation_key=judgement ++add_generation_stats=False"
116 changes: 116 additions & 0 deletions nemo_skills/dataset/hle_verified/prepare.py
@@ -0,0 +1,116 @@
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import json
from pathlib import Path

from datasets import load_dataset
from tqdm import tqdm

HLE_CATEGORIES_MAP = {
"Other": "other",
"Humanities/Social Science": "human",
"Math": "math",
"Physics": "phy",
"Computer Science/AI": "cs",
"Biology/Medicine": "bio",
"Chemistry": "chem",
"Engineering": "eng",
}

# Reverse mapping for filtering
HLE_REVERSE_MAP = {v: k for k, v in HLE_CATEGORIES_MAP.items()}

HLE_VERIFIED_CLASSES_MAP = {
"Gold subset": "gold",
"Revision subset": "revision",
"Uncertain subset": "uncertain",
}

# Reverse mapping for filtering
HLE_VERIFIED_CLASSES_REVERSE_MAP = {v: k for k, v in HLE_VERIFIED_CLASSES_MAP.items()}

REPO_ID = "skylenage/HLE-Verified"


def load_dataset_from_hub():
"""Load the dataset from HuggingFace hub.

Fields not exposed as top-level columns (author_name, rationale, answer_type,
canary, image) are stored as a JSON string in the 'json' column and parsed here.
"""
df = load_dataset(REPO_ID, split="train").to_pandas()

parsed = df["json"].apply(json.loads)
for field in ("author_name", "rationale", "answer_type", "canary", "image"):
df[field] = parsed.apply(lambda x, f=field: x.get(f))
Comment on lines +56 to +58

⚠️ Potential issue | 🟠 Major

Fail fast on schema drift instead of silently defaulting.

Using .get() here can quietly produce invalid records (None or unmapped labels) and hide upstream data changes.

Suggested fix
-    parsed = df["json"].apply(json.loads)
-    for field in ("author_name", "rationale", "answer_type", "canary", "image"):
-        df[field] = parsed.apply(lambda x, f=field: x.get(f))
+    parsed = df["json"].apply(json.loads)
+    for field in ("author_name", "rationale", "answer_type", "canary", "image"):
+        df[field] = parsed.apply(lambda x, f=field: x[f])
@@
-        "verified_class": HLE_VERIFIED_CLASSES_MAP.get(entry["Verified_Classes"], entry["Verified_Classes"]),
+        "verified_class": HLE_VERIFIED_CLASSES_MAP[entry["Verified_Classes"]],

As per coding guidelines: "Don't use .get() for accessing dictionary keys if the code expects them to be present; use direct access data[key_name] to fail with a clear error instead of silently corrupting data".

Also applies to: 74-74
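The difference is easy to demonstrate in isolation (a minimal standalone sketch — the dict contents and field name below are hypothetical, not the dataset's actual schema):

```python
# Hypothetical record with one expected field missing, simulating schema drift.
row = {"author_name": "A. Author", "rationale": "..."}

# .get() silently yields None for the missing field -- the invalid record propagates
silently_none = row.get("answer_type")

# direct indexing raises immediately, surfacing the upstream change
try:
    _ = row["answer_type"]
    failed_fast = False
except KeyError:
    failed_fast = True
```

With direct access, the prepare script crashes at data-prep time with a clear `KeyError` instead of emitting `None`-valued records that corrupt evaluation later.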



return df


def format_entry(entry):
return {
"id": entry["id"],
"problem": entry["question"],
"expected_answer": entry["answer"],
"answer_type": entry["answer_type"],
"reference_solution": entry["rationale"],
"raw_subject": entry["raw_subject"],
"subset_for_metrics": entry["category"],
"author_name": entry["author_name"],
"canary": entry["canary"],
"verified_class": HLE_VERIFIED_CLASSES_MAP.get(entry["Verified_Classes"], entry["Verified_Classes"]),
}


def write_data_to_file(output_file, data, split):
with open(output_file, "wt", encoding="utf-8") as fout:
for _, entry in tqdm(data.iterrows(), total=len(data), desc=f"Writing {output_file.name}"):
# Filter by category for category-specific splits
if split in HLE_REVERSE_MAP and entry["category"] != HLE_REVERSE_MAP[split]:
continue
# Filter by verified class for class-specific splits
if split in HLE_VERIFIED_CLASSES_REVERSE_MAP:
if entry["Verified_Classes"] != HLE_VERIFIED_CLASSES_REVERSE_MAP[split]:
continue
if entry["image"]:
continue
# text split = text-only entries from Gold + Revision subsets only
if split == "text" and entry["Verified_Classes"] == HLE_VERIFIED_CLASSES_REVERSE_MAP["uncertain"]:
continue
json.dump(format_entry(entry), fout)
fout.write("\n")
Comment on lines +78 to +94

⚠️ Potential issue | 🟠 Major

Do transformation/filtering before opening the output file.

If formatting/filtering fails mid-loop, the current flow can leave a partially written file and overwrite valid prior output.

Suggested fix
 def write_data_to_file(output_file, data, split):
-    with open(output_file, "wt", encoding="utf-8") as fout:
-        for _, entry in tqdm(data.iterrows(), total=len(data), desc=f"Writing {output_file.name}"):
-            # Filter by category for category-specific splits
-            if split in HLE_REVERSE_MAP and entry["category"] != HLE_REVERSE_MAP[split]:
-                continue
-            # Filter by verified class for class-specific splits
-            if split in HLE_VERIFIED_CLASSES_REVERSE_MAP:
-                if entry["Verified_Classes"] != HLE_VERIFIED_CLASSES_REVERSE_MAP[split]:
-                    continue
-            if entry["image"]:
-                continue
-            # text split = text-only entries from Gold + Revision subsets only
-            if split == "text" and entry["Verified_Classes"] == HLE_VERIFIED_CLASSES_REVERSE_MAP["uncertain"]:
-                continue
-            json.dump(format_entry(entry), fout)
-            fout.write("\n")
+    records = []
+    for _, entry in tqdm(data.iterrows(), total=len(data), desc=f"Preparing {output_file.name}"):
+        if split in HLE_REVERSE_MAP and entry["category"] != HLE_REVERSE_MAP[split]:
+            continue
+        if split in HLE_VERIFIED_CLASSES_REVERSE_MAP and entry["Verified_Classes"] != HLE_VERIFIED_CLASSES_REVERSE_MAP[split]:
+            continue
+        if entry["image"]:
+            continue
+        if split == "text" and entry["Verified_Classes"] == HLE_VERIFIED_CLASSES_REVERSE_MAP["uncertain"]:
+            continue
+        records.append(format_entry(entry))
+
+    with open(output_file, "wt", encoding="utf-8") as fout:
+        for record in records:
+            json.dump(record, fout)
+            fout.write("\n")

As per coding guidelines: "When adding new benchmarks, avoid data loss by doing all computation before re-opening files for writing; ensure computation completes before file writes to prevent accidental data loss if code fails".
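The pattern the guideline describes can be sketched roughly like this (simplified placeholders — the `skip` flag and record shape are invented for illustration, not the benchmark's real filter logic):

```python
import json
import tempfile
from pathlib import Path


def prepare_records(rows):
    # all filtering/formatting happens up front; any exception fires
    # before the output file is ever opened or truncated
    return [{"id": row["id"]} for row in rows if not row["skip"]]


def write_jsonl(records, output_file):
    # only reached once preparation fully succeeded
    with open(output_file, "wt", encoding="utf-8") as fout:
        for record in records:
            fout.write(json.dumps(record) + "\n")


rows = [
    {"id": 1, "skip": False},
    {"id": 2, "skip": True},
    {"id": 3, "skip": False},
]
records = prepare_records(rows)  # may raise; the output file is untouched so far
out_file = Path(tempfile.mkdtemp()) / "sample.jsonl"
write_jsonl(records, out_file)
```

Because writing starts only after the full record list exists, a failure during preparation cannot clobber a previously generated `.jsonl`.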




if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--split",
default="all",
choices=("all", "text") + tuple(HLE_CATEGORIES_MAP.values()) + tuple(HLE_VERIFIED_CLASSES_MAP.values()),
help="Dataset split to process (all/text/math/other/human/phy/cs/bio/chem/eng/gold/revision/uncertain).",
)
args = parser.parse_args()
dataset = load_dataset_from_hub()
data_dir = Path(__file__).absolute().parent
data_dir.mkdir(exist_ok=True)
if args.split == "all":
all_splits = ["text"] + list(HLE_CATEGORIES_MAP.values()) + list(HLE_VERIFIED_CLASSES_MAP.values())
for split in all_splits:
output_file = data_dir / f"{split}.jsonl"
write_data_to_file(output_file, dataset, split)
else:
output_file = data_dir / f"{args.split}.jsonl"
write_data_to_file(output_file, dataset, args.split)
27 changes: 27 additions & 0 deletions nemo_skills/dataset/ugphysics/__init__.py
@@ -0,0 +1,27 @@
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# settings that define how evaluation should be done by default (all can be changed from cmdline)
METRICS_TYPE = "ugphysics"
GENERATION_ARGS = "++prompt_config=generic/ugphysics ++eval_type=math"
EVAL_SPLIT = "en"

# Setting openai judge by default, but can be overridden from command line for a locally hosted model
# Currently using o4-mini-2025-04-16
JUDGE_PIPELINE_ARGS = {
"model": "o4-mini-2025-04-16",
"server_type": "openai",
"server_address": "https://api.openai.com/v1",
}
JUDGE_ARGS = "++prompt_config=judge/ugphysics ++generation_key=judgement ++add_generation_stats=False"
126 changes: 126 additions & 0 deletions nemo_skills/dataset/ugphysics/prepare.py
@@ -0,0 +1,126 @@
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import json
from pathlib import Path

from datasets import load_dataset
from tqdm import tqdm

# From https://github.com/YangLabHKUST/UGPhysics/blob/main/codes/utils.py#L126
OB_ANS_TYPE_ID2EN = {
"IN": "a range interval",
"TF": "either True or False",
"EX": "an expression",
"EQ": "an equation",
"MC": "one option of a multiple choice question",
"NV": "a numerical value without units",
"TUP": "multiple numbers, separated by comma, such as (x, y, z)",
}

SUBSETS = [
"AtomicPhysics",
"ClassicalElectromagnetism",
"ClassicalMechanics",
"Electrodynamics",
"GeometricalOptics",
"QuantumMechanics",
"Relativity",
"SemiconductorPhysics",
"Solid-StatePhysics",
"StatisticalMechanics",
"TheoreticalMechanics",
"Thermodynamics",
"WaveOptics",
]


def get_prompt_sentence(answer_type, is_multiple_answer):
"""Build the prompt sentence describing the expected answer format.
Adapted from https://github.com/YangLabHKUST/UGPhysics/blob/main/codes/utils.py#L146
"""
types = [t.strip() for t in answer_type.split(",")]
descriptions = [OB_ANS_TYPE_ID2EN.get(t, t) for t in types]

⚠️ Potential issue | 🟠 Major

Avoid fallback mapping for required answer type IDs.

Using .get(t, t) can silently accept unknown/invalid type IDs and produce inconsistent prompts.

Suggested fix
-    descriptions = [OB_ANS_TYPE_ID2EN.get(t, t) for t in types]
+    descriptions = [OB_ANS_TYPE_ID2EN[t] for t in types]

As per coding guidelines: "Don't use .get() for accessing dictionary keys if the code expects them to be present; use direct access data[key_name] to fail with a clear error instead of silently corrupting data".


if not is_multiple_answer:
return f"The answer of the problem should be {descriptions[0]}."
elif len(set(descriptions)) == 1:
return f"The problem has multiple answers, each of them should be {descriptions[0]}."
else:
return f"The problem has multiple answers, with the answers in order being {', '.join(descriptions)}."


def get_boxed_answer_example(is_multiple_answer):
"""Get the boxed answer placeholder string for the prompt."""
if is_multiple_answer:
return r"\boxed{multiple answers connected with commas}"
return r"\boxed{answer}(unit)"


def format_entry(entry):
is_multiple_answer = entry["is_multiple_answer"]
answer_type = entry["answer_type"]
return {
"index": entry["index"],
"problem": entry["problem"],
"expected_answer": entry["answers"],
"solution": entry["solution"],
"answer_type": answer_type,
"subset_for_metrics": entry["subject"],
"language": entry["language"].lower(),
"is_multiple_answer": is_multiple_answer,
"prompt_sentence": get_prompt_sentence(answer_type, is_multiple_answer),
"boxed_answer_example": get_boxed_answer_example(is_multiple_answer),
}


def load_data(lang_split):
data = []
for subset in tqdm(SUBSETS, desc=f"Loading {lang_split} subsets"):
subset_data = load_dataset("UGPhysics/ugphysics", subset, split=lang_split)
data.extend(subset_data)
return data


def save_data(data, output_path):
output_path = Path(output_path)
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, "wt", encoding="utf-8") as fout:
for entry in tqdm(data, desc=f"Writing {output_path.name}"):
json.dump(format_entry(entry), fout)
fout.write("\n")


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--split", default="all", choices=("all", "en", "zh", "en_zh"))
args = parser.parse_args()

data_dir = Path(__file__).absolute().parent
data_dir.mkdir(exist_ok=True)

if args.split == "all":
en_data = load_data("en")
save_data(en_data, data_dir / "en.jsonl")
zh_data = load_data("zh")
save_data(zh_data, data_dir / "zh.jsonl")
save_data(en_data + zh_data, data_dir / "en_zh.jsonl")
else:
if args.split == "en_zh":
en_data = load_data("en")
zh_data = load_data("zh")
data = en_data + zh_data
else:
data = load_data(args.split)
save_data(data, data_dir / f"{args.split}.jsonl")
2 changes: 2 additions & 0 deletions nemo_skills/evaluation/metrics/map_metrics.py
@@ -45,11 +45,13 @@
from nemo_skills.evaluation.metrics.simpleqa_metrics import SimpleQAMetrics
from nemo_skills.evaluation.metrics.specdec_metrics import SpecdecMetrics
from nemo_skills.evaluation.metrics.translation_metrics import TranslationMetrics
from nemo_skills.evaluation.metrics.ugphysics_metrics import UGPhysicsMetrics

METRICS_MAP = {
"math": MathMetrics,
"hle": functools.partial(MathMetrics, compute_no_answer=False, answer_key="generation"),
"physics": PhysicsMetrics,
"ugphysics": UGPhysicsMetrics,
"hle-aa": functools.partial(HLEAAMetrics, compute_no_answer=False, answer_key="generation"),
"frontierscience-olympiad": functools.partial(
MathMetrics, compute_no_answer=False, question_key="question", answer_key="generation"
51 changes: 51 additions & 0 deletions nemo_skills/evaluation/metrics/ugphysics_metrics.py
@@ -0,0 +1,51 @@
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import re

from nemo_skills.evaluation.metrics.math_metrics import MathMetrics
from nemo_skills.utils import get_logger_name

LOG = logging.getLogger(get_logger_name(__file__))


class UGPhysicsMetrics(MathMetrics):
def __init__(self, compute_no_answer: bool = False, answer_key: str = "generation"):
super().__init__(compute_no_answer=compute_no_answer)
self.answer_key = answer_key

def is_correct_judgement(self, judgement: str, return_none: bool = False) -> bool:
"""Parse UGPhysics judgement that returns TRUE or FALSE in ## Equivalence Judgement section."""
if judgement:
# Look for the Equivalence Judgement section
equiv_match = re.search(r"##\s*Equivalence\s*Judgement\s*\n\s*(TRUE|FALSE)", judgement, re.IGNORECASE)
if equiv_match:
return equiv_match.group(1).upper() == "TRUE"
# Fallback: look for standalone TRUE/FALSE (case-insensitive), use last match
true_false_matches = list(re.finditer(r"\b(TRUE|FALSE)\b", judgement, re.IGNORECASE))
if true_false_matches:
return true_false_matches[-1].group(1).upper() == "TRUE"
Comment on lines +36 to +39

⚠️ Potential issue | 🟠 Major

Fallback parsing will treat the rubric text as a real verdict.

nemo_skills/prompt/config/judge/ugphysics.yaml:26-27 includes the literal string (TRUE or FALSE) under the ## Equivalence Judgement header. If a model echoes that template but never emits an actual judgement, this fallback grabs the last token and returns False, silently mis-scoring malformed outputs. Please restrict the fallback to verdict-only lines (or similarly strip the rubric text first).

Safer fallback example
-            true_false_matches = list(re.finditer(r"\b(TRUE|FALSE)\b", judgement, re.IGNORECASE))
+            true_false_matches = list(
+                re.finditer(r"^\s*(TRUE|FALSE)\s*$", judgement, re.IGNORECASE | re.MULTILINE)
+            )
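The failure mode is easy to reproduce (a standalone sketch — both judge outputs below are invented examples, not real model transcripts):

```python
import re

# a malformed judgement that merely echoes the rubric template
echoed_rubric = "## Equivalence Judgement\n(TRUE or FALSE)\n\n...model never concludes..."
# a well-formed judgement with the verdict on its own line
real_verdict = "## Reasoning\n...\n## Equivalence Judgement\nTRUE\n"

# loose fallback: happily matches the rubric's own TRUE/FALSE tokens
loose = re.findall(r"\b(TRUE|FALSE)\b", echoed_rubric, re.IGNORECASE)

# full-line fallback: only accepts a line that is nothing but the verdict
strict_pattern = r"^\s*(TRUE|FALSE)\s*$"
strict_on_rubric = re.findall(strict_pattern, echoed_rubric, re.IGNORECASE | re.MULTILINE)
strict_on_verdict = re.findall(strict_pattern, real_verdict, re.IGNORECASE | re.MULTILINE)
```

The loose pattern finds both rubric tokens and would score the echoed template as a `FALSE` verdict, while the full-line pattern correctly rejects it yet still accepts a genuine verdict line.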


# improper judgement format, so have to judge as false
return None if return_none else False

def get_incorrect_sample(self, prediction: dict) -> dict:
prediction = prediction.copy()
if "symbolic_correct" in prediction:
prediction["symbolic_correct"] = False
if "judgement" in prediction:
prediction["judgement"] = "FALSE"
prediction[self.answer_key] = None
return prediction
10 changes: 10 additions & 0 deletions nemo_skills/prompt/config/generic/ugphysics.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# adapted from https://github.com/YangLabHKUST/UGPhysics
# {prompt_sentence} is the prompt sentence describing the expected answer format for questions that have single or multiple answers

user: |-
The following is an open-ended problem from {subset_for_metrics} of the undergraduate-level Physics.
{prompt_sentence}
Please calculate the answer according to the given requirements and the information provided.
Please use LaTeX format to represent the variables and formulas used in the solution process and results.
Please end your solution with "So the final answer is {boxed_answer_example}." and give the result explicitly, note that the unit of the answer should not be included in \boxed{{}}.
Problem: {problem}