7 changes: 6 additions & 1 deletion docs/evaluation/code.md
@@ -196,4 +196,9 @@ all you need to do is replace `openhands` with `swe_agent` in the command above.
### bigcodebench

- Benchmark is defined in [`nemo_skills/dataset/bigcodebench/__init__.py`](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/dataset/bigcodebench/__init__.py)
- Original benchmark source is [here](https://github.com/bigcode-project/bigcodebench).
- Original benchmark source is [here](https://github.com/bigcode-project/bigcodebench).

### livebench-coding

- Benchmark is defined in [`nemo_skills/dataset/livebench-coding/__init__.py`](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/dataset/livebench-coding/__init__.py)
- Original benchmark source is [here](https://huggingface.co/datasets/livebench/coding).
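An illustrative way to run it (a sketch following the same `ns eval` pattern as the other benchmarks on this page; the flags and paths below are assumptions, not part of this change):

```bash
ns eval \
    --cluster=local \
    --server_type=vllm \
    --model=/workspace/my-model \
    --benchmarks=livebench-coding \
    --output_dir=/workspace/livebench-coding-eval
```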
20 changes: 20 additions & 0 deletions nemo_skills/dataset/livebench-coding/__init__.py
@@ -0,0 +1,20 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# settings that define how evaluation should be done by default (all can be changed from cmdline)
DATASET_GROUP = "code"
METRICS_TYPE = "livecodebench"
EVAL_SPLIT = "test"
EVAL_ARGS = "++eval_type=livebench_coding"
GENERATION_ARGS = "++prompt_config=generic/default"
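# Illustrative note (a sketch, not part of these defaults): the ++ keys above are
# command-line overrides, so passing e.g. ++prompt_config=<your_config> at launch time
# takes precedence over GENERATION_ARGS. EVAL_ARGS selects the "livebench_coding"
# evaluator registered in nemo_skills/evaluation/evaluator/__init__.py below.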
73 changes: 73 additions & 0 deletions nemo_skills/dataset/livebench-coding/prepare.py
@@ -0,0 +1,73 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import json
import os
from pathlib import Path

from datasets import load_dataset


def parse_data():
data = load_dataset("livebench/coding", split="test", trust_remote_code=True)
# Dataset({
# features: ['question_id', 'category', 'turns', 'question_title', 'public_test_cases', 'private_test_cases', 'original_json', 'release_date', 'citation', 'task', 'livebench_release_date', 'livebench_removal_date', 'remainder', 'solution', 'partial_solution'],
# num_rows: 128
# })
return data


def clean_data(dataset):
def map_fn(data):
question = data["turns"][0]
data["question"] = question.replace(" ", "\t")
return data

remove_columns = [
"category",
"turns",
"question_title",
"public_test_cases",
"private_test_cases",
"original_json",
"release_date",
"citation",
"livebench_release_date",
"livebench_removal_date",
"remainder",
"solution",
]
dataset = dataset.map(map_fn, remove_columns=remove_columns)
return dataset


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--output_dir", type=str, default=str(Path(__file__).parent))
args = parser.parse_args()

data = parse_data()
data = clean_data(data)
print("Len of data: ", len(data))

print("Writing to file...")
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)

output_file_path = os.path.join(args.output_dir, "test.jsonl")
with open(output_file_path, "w") as f:
for problem in data:
json.dump(problem, f)
f.write("\n")
2 changes: 2 additions & 0 deletions nemo_skills/evaluation/evaluator/__init__.py
@@ -18,6 +18,7 @@
from nemo_skills.evaluation.evaluator.code import (
eval_bigcodebench,
eval_evalplus,
eval_livebench_coding,
eval_livecodebench,
eval_livecodebench_pro,
)
@@ -51,6 +52,7 @@ def dummy_eval(cfg):
"multichoice": eval_mcq,
"ruler": eval_ruler,
"livecodebench": eval_livecodebench,
"livebench_coding": eval_livebench_coding,
"livecodebench_pro": eval_livecodebench_pro,
"scicode": eval_scicode,
"mrcr": eval_mrcr,
74 changes: 67 additions & 7 deletions nemo_skills/evaluation/evaluator/code.py
@@ -32,9 +32,10 @@
)


def preprocess_code(generation_dict: dict, language="python"):
def preprocess_code(generation_dict: dict, language="python", strip_whitespace=True):
completion = generation_dict["generation"]
completion = completion.strip()
if strip_whitespace:
completion = completion.strip()
completion = completion.replace("\r", "")

##### To handle code generation by reasoning models
@@ -57,25 +58,33 @@ def preprocess_code(generation_dict: dict, language="python"):

if start_with_lang_tag in completion:
def_line = completion.index(start_with_lang_tag) + len(start_with_lang_tag)
completion = completion[def_line:].strip()
completion = completion[def_line:]
if strip_whitespace:
completion = completion.strip()
try:
next_line = completion.index(generic_start_end_tag)
completion = completion[:next_line].strip()
completion = completion[:next_line]
if strip_whitespace:
completion = completion.strip()
except Exception:
print(completion)
print("================\n")

elif generic_start_end_tag in completion:
def_line = completion.index(generic_start_end_tag) + len(generic_start_end_tag)
completion = completion[def_line:].strip()
completion = completion[def_line:]
if strip_whitespace:
completion = completion.strip()
try:
next_line = completion.index(generic_start_end_tag)
completion = completion[:next_line].strip()
completion = completion[:next_line]
if strip_whitespace:
completion = completion.strip()
except Exception:
print(completion)
print("================\n")

if completion.startswith(" "):
if completion.startswith(" ") and strip_whitespace:
completion = completion.strip()

generation_dict["completion"] = completion
@@ -219,6 +228,57 @@ def install_requirements(url):
print(f"Error during installation: {e}")


def eval_livebench_coding(cfg):
try:
from livecodebench.evaluate import evaluate
except ImportError:
LOG.info("Package 'livecodebench' not found. Attempting to install...")
install_from_git("git+https://github.com/wasiahmad/livecodebench.git@livebench")
try:
from livecodebench.evaluate import evaluate
except ImportError:
LOG.info("Failed to install 'livecodebench'. Please install it manually.")
raise

for jsonl_file in unroll_files(cfg.input_files):
samples = []
with open(jsonl_file) as f:
for line in f:
sample = json.loads(line)
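                # livebench coding has two task types: "coding_completion", where the model
                # continues a provided partial solution, and regular code generation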
if sample["task"] == "coding_completion":
assert len(sample["partial_solution"]) > 0
sample = preprocess_code(sample, strip_whitespace=False)
sample["completion"] = sample["completion"].replace("\t", " ")
full_solution = sample["partial_solution"] + "\n" + sample["completion"]
sample["code_list"] = [full_solution]
else:
sample = preprocess_code(sample, strip_whitespace=True)
sample["code_list"] = [sample["completion"]]

samples.append(sample)

with open(jsonl_file, "wt", encoding="utf-8") as f:
for sample in samples:
f.write(json.dumps(sample) + "\n")

evaluate(
custom_output_file=jsonl_file,
k_list=[1],
num_process_evaluate=12,
timeout=6,
)

with open(jsonl_file[:-6] + "_eval_results.json", "rt", encoding="utf-8") as fin:
eval_grades = json.load(fin)
with open(jsonl_file, "wt", encoding="utf-8") as f:
for sample in samples:
sample["graded_list"] = eval_grades["eval"][sample["question_id"]]["graded_list"]
f.write(json.dumps(sample) + "\n")

# moving eval file to ensure metrics are recomputed
shutil.move(jsonl_file[:-6] + "_eval_results.json", jsonl_file[:-6] + "_eval_results-saved.json")


def install_or_upgrade_package(package_name):
try:
# Run the pip command to install or upgrade the package
1 change: 1 addition & 0 deletions nemo_skills/evaluation/metrics/map_metrics.py
@@ -48,6 +48,7 @@
"scicode": SciCodeMetrics,
"bigcodebench": BigCodeBenchMetrics,
"mrcr": MRCRMetrics,
"livebench_coding": LiveCodeBenchMetrics,
}

