NVIDIA-NeMo · Kipok · Aug 11, 2025 · Jul 31, 2025 · Jul 31, 2025 · Jul 31, 2025
diff --git a/README.md b/README.md
@@ -15,7 +15,7 @@ Here are some of the features we support:
     - Coding skills: scicode, livecodebench, human-eval, mbpp
     - Chat/instruction following: ifbench, ifeval, arena-hard
     - General knowledge: mmlu, mmlu-pro, gpqa
-    - Long context: ruler
+    - Long context: ruler, mrcr
   - Easily parallelize each evaluation across many slurm jobs, self-host LLM judges, bring your own prompts or change benchmark configuration in any other way.
 - [Model training](https://nvidia.github.io/NeMo-Skills/pipelines/training): Train models using [NeMo-Aligner](https://github.com/NVIDIA/NeMo-Aligner/), [NeMo-RL](https://github.com/NVIDIA/NeMo-RL/) or [verl](https://github.com/volcengine/verl).
 

diff --git a/nemo_skills/dataset/mrcr/__init__.py b/nemo_skills/dataset/mrcr/__init__.py
@@ -0,0 +1,19 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Benchmark configuration for the MRCR dataset.
+# Default split name; prepare.py writes "<setup>.jsonl" and its default setup is "all".
+EVAL_SPLIT = 'all'
+# NOTE(review): this is the literal string 'null'; presumably it disables the prompt
+# template because the prepared data already carries chat messages -- confirm.
+PROMPT_CONFIG = 'null'
+DATASET_GROUP = 'long-context'
+# Same name as the "mrcr" entry registered in METRICS_MAP; presumably used as the lookup key.
+METRICS_TYPE = 'mrcr'
+# Same name as the 'mrcr' entry registered in the evaluator map (eval_mrcr).
+EVAL_ARGS = '++eval_type=mrcr'
+# NOTE(review): presumably makes generation send the prepared `messages` as an
+# OpenAI-style chat request -- confirm against the generation pipeline.
+GENERATION_ARGS = '++prompt_format=openai'
diff --git a/nemo_skills/dataset/mrcr/prepare.py b/nemo_skills/dataset/mrcr/prepare.py
@@ -0,0 +1,102 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import json
+import subprocess
+from pathlib import Path
+
+import tiktoken
+from datasets import load_dataset
+from tqdm import tqdm
+
+"""
+Usage
+# Default: no context-length limit, needles 2/4/8, setup name "all" (writes all.jsonl).
+python prepare.py
+
+# Prepare a subset, e.g. needle2_128k: only 2-needle samples of at most 131072 tokens.
+python prepare.py --max_context_window 131072 --needles_subset 2 --setup needle2_128k
+python prepare.py --max_context_window 131072 --needles_subset 2 4 --setup needle2_needle_4_128k
+"""
+
+
+def count_n_tokens(messages: list[dict]) -> int:
+    """Return the total token count over the ``content`` of every message.
+
+    Each ``content`` string is encoded with tiktoken's ``o200k_base``
+    encoding and the encoded lengths are added up, following the official
+    way of counting tokens in messages.
+    """
+    encoding = tiktoken.get_encoding("o200k_base")
+    total = 0
+    for message in messages:
+        total += len(encoding.encode(message["content"]))
+    return total
+
+
+def write_data_to_file(output_file, data, max_context_window, needles_subset):
+    """Filter MRCR entries and write the kept ones to ``output_file`` as JSON lines.
+
+    Args:
+        output_file: path object with a ``name`` attribute; it is opened for
+            writing, so any existing content is replaced.
+        data: iterable of dict entries providing ``prompt`` (a JSON-encoded
+            list of messages), ``answer`` and ``n_needles``.
+        max_context_window: entries with more tokens than this are skipped;
+            ``None`` disables the length filter.
+        needles_subset: container of the ``n_needles`` values to keep.
+
+    Every kept entry is written with ``prompt`` renamed to ``messages``,
+    ``answer`` renamed to ``expected_answer`` and an added ``n_tokens`` field.
+    Kept entries are modified in place.
+    """
+    with open(output_file, "wt", encoding="utf-8") as fout:
+        # Wrap the data itself rather than the enumerate object: enumerate has
+        # no length, so tqdm could not show a total or an ETA.
+        for idx, entry in enumerate(tqdm(data, desc=f"Writing {output_file.name}")):
+            # Cheap filter first, so prompts that are skipped anyway are never parsed.
+            if entry['n_needles'] not in needles_subset:
+                print(f"Skipping {idx} because it has {entry['n_needles']} needle")
+                continue
+
+            # The prompt is parsed only to count its tokens.
+            n_tokens = count_n_tokens(json.loads(entry["prompt"]))
+            if max_context_window is not None and n_tokens > max_context_window:
+                print(f"Skipping {idx} because it has {n_tokens} tokens")
+                continue
+
+            # NOTE(review): ``messages`` receives the JSON-encoded string from
+            # ``prompt``, not the parsed list -- confirm the consumer expects that.
+            entry['messages'] = entry.pop('prompt')
+            entry['expected_answer'] = entry.pop('answer')
+            entry['n_tokens'] = n_tokens
+            json.dump(entry, fout)
+            fout.write("\n")
+
+
+def get_mrcr_data(needles_subset, setup, max_context_window):
+    """Load the ``train`` split of ``openai/mrcr`` and write ``<setup>.jsonl``.
+
+    The output file is placed in the directory that contains this script;
+    filtering and writing are delegated to ``write_data_to_file``.
+    """
+    output_file = Path(__file__).absolute().parent / f"{setup}.jsonl"
+    train_split = load_dataset("openai/mrcr")['train']
+    write_data_to_file(output_file, train_split, max_context_window, needles_subset)
+
+
+if __name__ == "__main__":
+    # Command-line entry point: parse the filter options, then write <setup>.jsonl
+    # next to this script.
+    parser = argparse.ArgumentParser(description="Prepare MRCR dataset.")
+    parser.add_argument(
+        "--max_context_window",
+        type=int,
+        default=None,
+        help="Maximum context window size.",
+    )
+    parser.add_argument(
+        "--needles_subset",
+        nargs="+",
+        type=int,
+        choices=[2, 4, 8],
+        default=[2, 4, 8],
+        help="Needles subset to include.",
+    )
+
+    parser.add_argument(
+        "--setup",
+        type=str,
+        default="all",
+        help="setup name. e.g. all or <needle2>_<128k>",
+    )
+
+    args = parser.parse_args()
+
+    print(f"Preparing MRCR dataset with additional arguments: {args}")
+    get_mrcr_data(args.needles_subset, args.setup, args.max_context_window)
+    # The hint is meant to be copy-pasted; the previous "${args.setup}" printed a
+    # stray "$" (e.g. "--split=$all"), which a shell would expand as a variable.
+    print(f"MRCR dataset preparation with setup {args.setup} completed. Use --split={args.setup} to evaluate!")
diff --git a/nemo_skills/evaluation/evaluator/__init__.py b/nemo_skills/evaluation/evaluator/__init__.py
@@ -22,6 +22,7 @@
 from nemo_skills.evaluation.evaluator.mcq import eval_mcq
 from nemo_skills.evaluation.evaluator.ruler import eval_ruler
 from nemo_skills.evaluation.evaluator.scicode import eval_scicode
+from nemo_skills.evaluation.evaluator.mrcr import eval_mrcr
 
 
 def dummy_eval(cfg):
@@ -43,6 +44,7 @@ def dummy_eval(cfg):
     'livecodebench': eval_livecodebench,
     'livecodebench_pro': eval_livecodebench_pro,
     'scicode': eval_scicode,
+    'mrcr': eval_mrcr,
 }
 
 

diff --git a/nemo_skills/evaluation/evaluator/mrcr.py b/nemo_skills/evaluation/evaluator/mrcr.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import logging
+from tqdm import tqdm
+from nemo_skills.utils import get_logger_name,  unroll_files
+from difflib import SequenceMatcher
+
+LOG = logging.getLogger(get_logger_name(__file__))
+
+
+def eval_mrcr(cfg):
+    """Grade MRCR generations and store the score in every sample.
+
+    Each file yielded by ``unroll_files(cfg.input_files)`` is read as JSON
+    lines. Every sample must provide ``generation``, ``expected_answer`` and
+    ``random_string_to_prepend``. The file is then rewritten in place with an
+    added ``seq_match_ratio`` field per sample.
+    """
+
+    def grade(response, answer, random_string_to_prepend) -> float:
+        """Return the SequenceMatcher ratio between response and answer.
+
+        A response that does not start with ``random_string_to_prepend``
+        scores 0.0; otherwise the prefix is removed from both strings before
+        they are compared.
+        Official grading function: https://huggingface.co/datasets/openai/mrcr
+        """
+        if not response.startswith(random_string_to_prepend):
+            return 0.0
+        response = response.removeprefix(random_string_to_prepend)
+        answer = answer.removeprefix(random_string_to_prepend)
+        return float(SequenceMatcher(None, response, answer).ratio())
+
+    for file in unroll_files(cfg.input_files):
+        with open(file, 'rt', encoding='utf-8') as fin:
+            data = [json.loads(line) for line in fin]
+
+        # Grade everything before reopening the file: mode 'wt' truncates it, so
+        # an exception raised while grading (e.g. a missing key) would otherwise
+        # destroy the generations stored in it.
+        for sample in tqdm(data):
+            sample['seq_match_ratio'] = grade(
+                sample['generation'],
+                sample['expected_answer'],
+                sample['random_string_to_prepend'],
+            )
+
+        with open(file, 'wt', encoding='utf-8') as fout:
+            for sample in data:
+                fout.write(json.dumps(sample) + "\n")
diff --git a/nemo_skills/evaluation/metrics/map_metrics.py b/nemo_skills/evaluation/metrics/map_metrics.py
@@ -19,6 +19,7 @@
 from nemo_skills.evaluation.metrics.lean4_metrics import Lean4Metrics
 from nemo_skills.evaluation.metrics.math_metrics import MathMetrics
 from nemo_skills.evaluation.metrics.ruler_metrics import RulerMetrics
+from nemo_skills.evaluation.metrics.mrcr_metrics import MRCRMetrics
 
 METRICS_MAP = {
     "math": MathMetrics,
@@ -33,6 +34,7 @@
     "ruler": RulerMetrics,
     "livecodebench": LiveCodeBenchMetrics,
     "scicode": SciCodeMetrics,
+    "mrcr": MRCRMetrics,
 }
 
 

diff --git a/nemo_skills/evaluation/metrics/mrcr_metrics.py b/nemo_skills/evaluation/metrics/mrcr_metrics.py
@@ -0,0 +1,26 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from nemo_skills.evaluation.metrics.base import BaseMetrics
+
+
+class MRCRMetrics(BaseMetrics):
+    """Metrics for the MRCR (Multi-Round Coreference Resolution) benchmark."""
+
+    def _get_score_dict(self, prediction: dict) -> dict[str, bool | int | float]:
+        """Expose the sample's ``seq_match_ratio`` under the ``accuracy`` key."""
+        seq_match_ratio = prediction['seq_match_ratio']
+        return {"accuracy": seq_match_ratio}
+
+    def update(self, predictions):
+        """Run the base-class update, then ``_compute_pass_at_k`` on the same predictions."""
+        super().update(predictions)
+        self._compute_pass_at_k(predictions=predictions)