Merged

54 commits
2f01be8
data downloading modified
wasiahmad Dec 5, 2025
f23866d
making test_25q2 as default split
wasiahmad Dec 5, 2025
0b7bd0d
updating data prep logic
wasiahmad Dec 16, 2025
359e757
updating data prep logic
wasiahmad Dec 16, 2025
1560196
updating data prep logic
wasiahmad Dec 16, 2025
dd376d6
MAINT update langugage-data dependency (#1076)
gwarmstrong Dec 5, 2025
01384aa
MAINT: Add audio requirements to vllm image (#1081)
gwarmstrong Dec 5, 2025
c91e459
Add apex-shortlist dataset (#1080)
i-vainn Dec 8, 2025
eb99be2
Introduce regex for small differences of formatting from judge (#1082)
wprazuch Dec 9, 2025
a6f475c
Add LCB Prompts, fix regex bug in robust_eval, remove CR, make summar…
gnalbandyan Dec 9, 2025
6cb9b79
MAINT pin nemo-evaluator (#1095)
gwarmstrong Dec 10, 2025
f96d242
Update issue templates
gwarmstrong Dec 11, 2025
087d762
Delete .github/ISSUE_TEMPLATE directory
gwarmstrong Dec 11, 2025
fdbefe9
enable blank issues (#1096)
gwarmstrong Dec 11, 2025
c2c38cd
Fix input_file path handling when executor is "none" (#1089)
bzantium Dec 11, 2025
a915e8d
TST for #1089 (#1097)
gwarmstrong Dec 11, 2025
a5b3bd7
Stepheng/prover cleanup (#1078)
stephencge Dec 11, 2025
62a8a06
add stem dependencies in main python sandbox (#1099)
jiacheng-xu Dec 11, 2025
0af2629
Audiometrics unification (#1093)
Jorjeous Dec 11, 2025
f796b77
FEAT Add Tavily Search (#1085)
gwarmstrong Dec 11, 2025
f40f3a1
updating code extraction logic (#1086)
wasiahmad Dec 11, 2025
0727665
Sandbox add stem (#1101)
jiacheng-xu Dec 12, 2025
321edab
Handle none output in wmtp24++ (#1091)
Froxyy-dev Dec 12, 2025
d9e6d23
ENH enable sandbox env overrides in generate (#1107)
gwarmstrong Dec 12, 2025
f56614b
Search Tool Parameter updates (#1112)
gwarmstrong Dec 15, 2025
2af0b63
autoformalize cleanup (#1098)
stephencge Dec 15, 2025
4603c77
HF ASR Leaderboard Evaluation (#1104)
melllinia Dec 15, 2025
67fbc84
Stepheng/nemotron math proofs docs (#1111)
stephencge Dec 16, 2025
579e765
Stepheng/prover gpt oss fix (#1114)
stephencge Dec 16, 2025
079da02
changing code extraction logic
wasiahmad Dec 16, 2025
e33bf59
minor fixes
wasiahmad Dec 16, 2025
a379a6a
minor fixes
wasiahmad Dec 16, 2025
8c54820
minor fixes
wasiahmad Dec 16, 2025
97d5693
add Nemotron-Math-V2.pdf (#1113)
wedu-nvidia Dec 16, 2025
d031317
fixing metric issue and missing problem-id issue
wasiahmad Dec 16, 2025
b2b06ac
adding metric type for lcb-pro
wasiahmad Dec 17, 2025
8982271
debugging
wasiahmad Dec 17, 2025
32ad110
fixed a minor issue
wasiahmad Dec 17, 2025
9bb52d2
SWE-bench: don't pass external environment variables into Apptainer c…
ludwig-n Dec 16, 2025
f99e5cb
Adding clan PR with AudioBench and Librispeech PC. (#1103)
Jorjeous Dec 16, 2025
ad51e99
Schema overrides for tool-calling (#1118)
gwarmstrong Dec 16, 2025
28e7567
FIX tool call error handling and search tool errors (#1120)
gwarmstrong Dec 17, 2025
464561d
Use run.Script for generate pipeline (#1052)
gwarmstrong Dec 17, 2025
e3aad78
Port ICPC changes to IOI (#1046)
SeanNaren Dec 17, 2025
58eb7d9
replace raise error with LOG.warning in AA LCR dataset prepare (#1119)
anowaczynski-nvidia Dec 17, 2025
5575646
FIX tavily search results return type (#1123)
gwarmstrong Dec 17, 2025
0e64314
Revert "Use run.Script for generate pipeline (#1052)" (#1125)
gwarmstrong Dec 18, 2025
f3f9c90
Fix: add serialized_output on bad request (#1127)
gwarmstrong Dec 18, 2025
da917f6
update paper link (#1128)
wedu-nvidia Dec 18, 2025
dca79a6
update paper link, references to dataset, self-correction differences…
stephencge Dec 18, 2025
d94e953
updating documentation
wasiahmad Dec 18, 2025
35dc934
FIX ioi ignore (#1131)
gwarmstrong Dec 18, 2025
b149479
download AA-LCR_extracted-text.zip via hf_hub_download (#1126)
anowaczynski-nvidia Dec 18, 2025
18e7043
fixing conflicts
wasiahmad Dec 19, 2025
32 changes: 32 additions & 0 deletions docs/evaluation/code.md
@@ -328,6 +328,38 @@ Due to variance between runs, you can automatically repeat the evaluation and av
- Benchmark is defined in [`nemo_skills/dataset/livecodebench-pro/__init__.py`](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/livecodebench-pro/__init__.py)
- Original benchmark source is [here](https://github.com/GavinZhengOI/LiveCodeBench-Pro).

#### Data Preparation

First, prepare the dataset by running the `ns prepare_data` command. The arguments below generate the `test_24q4.jsonl`, `test_25q1.jsonl`, `test_25q2.jsonl`, and `test_25q3.jsonl` files.

```
ns prepare_data livecodebench-pro --cluster=local --data_dir=/workspace/ns-data
```

Note that this also downloads the test cases and stores them at `/workspace/ns-data/livecodebench-pro/testcases`. We recommend using a cluster data location, since the testcases directory is roughly 15GB.
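After the command completes, the data directory should look roughly like this (file names taken from the command above; exact layout may vary):

```
/workspace/ns-data/livecodebench-pro/
├── test_24q4.jsonl
├── test_25q1.jsonl
├── test_25q2.jsonl
├── test_25q3.jsonl
└── testcases/    # ~15GB of downloaded test cases
```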

#### Running the Evaluation

```
ns eval \
--cluster=<CLUSTER_NAME> \
--model=nvidia/OpenReasoning-Nemotron-32B \
--server_type=vllm \
--server_args="--async-scheduling" \
--server_nodes=1 \
--server_gpus=8 \
--benchmarks=livecodebench-pro \
--split=test_25q2 \
--data_dir=/workspace/ns-data/livecodebench-pro \
--output_dir=<OUTPUT_DIR> \
++parse_reasoning=True \
++eval_config.test_file=/workspace/ns-data/livecodebench-pro/test_25q2.jsonl \
++eval_config.test_dir=/workspace/ns-data/livecodebench-pro/testcases \
++inference.temperature=0.6 \
++inference.top_p=0.95 \
++inference.tokens_to_generate=65536
```

### human-eval

- Benchmark is defined in [`nemo_skills/dataset/human-eval/__init__.py`](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/human-eval/__init__.py)
5 changes: 3 additions & 2 deletions nemo_skills/dataset/livecodebench-pro/__init__.py
@@ -14,5 +14,6 @@

# settings that define how evaluation should be done by default (all can be changed from cmdline)
DATASET_GROUP = "code"
METRICS_TYPE = "code"
GENERATION_ARGS = "++prompt_config=eval/livecodebench/python_codegen ++eval_type=livecodebench_pro"
METRICS_TYPE = "livecodebench_pro"
EVAL_SPLIT = "test_25q2"
GENERATION_ARGS = "++prompt_config=eval/livecodebench/cpp_codegen ++eval_type=livecodebench_pro"
72 changes: 62 additions & 10 deletions nemo_skills/dataset/livecodebench-pro/prepare.py
@@ -13,19 +13,71 @@
# limitations under the License.

import json
import os
from pathlib import Path

from datasets import load_dataset
from huggingface_hub import snapshot_download

TESTCASE_REPO = "QAQAQAQAQ/LiveCodeBench-Pro-Testcase"
PROBLEM_REPO = "QAQAQAQAQ/LiveCodeBench-Pro"
DEFAULT_SPLITS = [
("24q4", "quater_2024_10_12", 207),
("25q1", "quater_2025_1_3", 166),
("25q2", "quater_2025_4_6", 167),
("25q3", "quater_2025_7_9", 144),
]


def download_testcases(local_dir, token):
"""
Downloads the large testcase dataset (~15GB) to the specified directory.
"""
print(f"Downloading testcases from {TESTCASE_REPO} to {local_dir}...")
try:
path = snapshot_download(repo_id=TESTCASE_REPO, repo_type="dataset", local_dir=local_dir, token=token)
print(f"Testcases successfully downloaded to: {path}")
except Exception as e:
print(f"Failed to download testcases: {e}")
raise


def process_problem_splits(output_dir, token):
"""
Downloads problem descriptions, converts them to JSONL, and saves them.
"""
print(f"Processing problem splits from {PROBLEM_REPO}...")

for tag, split, sample_size in DEFAULT_SPLITS:
print(f" - Processing split: {split} -> test_{tag}.jsonl")

try:
dataset = load_dataset(PROBLEM_REPO, split=split, token=token)
if len(dataset) != sample_size:
print(f" WARNING: Expected {sample_size} samples for {split}, but got {len(dataset)}.")

output_file = output_dir / f"test_{tag}.jsonl"

with open(output_file, "w", encoding="utf-8") as f:
for row in dataset:
output_record = dict(row)
output_record["question"] = row["problem_statement"]
output_record["subset_for_metrics"] = row["difficulty"]

f.write(json.dumps(output_record) + "\n")

except Exception as e:
print(f" Error processing split {split}: {e}")


if __name__ == "__main__":
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
print("Error: HF_TOKEN environment variable is required.")
print("Please export it: export HF_TOKEN='hf_...'")
exit(1)

data_dir = Path(__file__).absolute().parent
output_file = str(data_dir / "test.jsonl")

dataset = load_dataset("anonymous1926/anonymous_dataset")
with open(output_file, "w") as f:
for split_name, split in dataset.items():
for row in split:
row["task_id"] = row.pop("problem_id")
row["question"] = row.pop("problem_statement")
row["split"] = split_name
f.write(json.dumps(row) + "\n")
testcase_dir = data_dir / "testcases"
download_testcases(local_dir=testcase_dir, token=hf_token)
process_problem_splits(output_dir=data_dir, token=hf_token)
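The per-row transform inside `process_problem_splits` can be sketched in isolation; `to_output_record` is a hypothetical helper name, and the field names come from the diff above:

```python
import json

def to_output_record(row: dict) -> str:
    # Copy the row and add the fields the eval pipeline expects:
    # "question" mirrors the problem statement, and "subset_for_metrics"
    # lets metrics be broken down by difficulty.
    record = dict(row)
    record["question"] = row["problem_statement"]
    record["subset_for_metrics"] = row["difficulty"]
    return json.dumps(record)

line = to_output_record({"problem_statement": "Sum two integers.", "difficulty": "easy"})
print(line)
```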
71 changes: 57 additions & 14 deletions nemo_skills/evaluation/evaluator/code.py
@@ -115,14 +115,14 @@ async def eval_full(self): # type: ignore[override]
LOG.info("Full evaluation completed successfully")


def preprocess_code(generation_dict: dict, language="python", strip_whitespace=True):
completion = generation_dict.get("generation", "") or ""
def preprocess_code(generation_dict: dict, language: str = "python", strip_whitespace: bool = True):
completion = generation_dict.get("generation", "")
completion = completion.replace("\r", "")

# ---------------------------------------------------------
# 1. Handle reasoning traces: <think>...</think>
# ---------------------------------------------------------
if "<think>" in completion:
if "</think>" in completion:
# partition is faster than regex and avoids imports
_, separator, post_thought = completion.partition("</think>")
if separator:
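The hunk above switches the guard from `<think>` to `</think>`, so a generation with an unterminated reasoning trace is left untouched. A minimal standalone sketch of the partition-based stripping (`strip_reasoning` is a hypothetical name):

```python
def strip_reasoning(completion: str) -> str:
    # str.partition avoids a regex import; if "</think>" is absent,
    # separator is "" and the completion is returned unchanged.
    if "</think>" in completion:
        _, separator, post_thought = completion.partition("</think>")
        if separator:
            completion = post_thought
    return completion

print(strip_reasoning("<think>scratch work</think>final code"))  # → "final code"
```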
@@ -194,7 +194,7 @@ def eval_evalplus(cfg):

jsonl_file = cfg.input_file
with open(jsonl_file) as f:
samples = [preprocess_code(json.loads(line)) for line in f]
samples = [preprocess_code(json.loads(line), language="python") for line in f]
# all changes will be done with a new key "completion", so it's ok to write to the same file
with open(jsonl_file, "wt", encoding="utf-8") as f:
for sample in samples:
@@ -236,20 +236,63 @@ def install_requirements(url):
print(f"Error during installation: {e}")


@nested_dataclass(kw_only=True)
class LiveCodeBenchProEvaluatorConfig(BaseEvaluatorConfig):
sandbox: dict = field(default_factory=lambda: {"sandbox_type": "local"})
language: str = "cpp" # use either "python" or "cpp"
test_file: str = None
test_dir: str = None # path to the unit tests directory
timeout: int = 6
num_processes: int = 12


def eval_livecodebench_pro(cfg):
cfg = BaseEvaluatorConfig(**cfg)
cfg = LiveCodeBenchProEvaluatorConfig(**cfg)
try:
from livecodebench.evaluate import evaluate
except ImportError:
LOG.info("Package 'livecodebench' not found. Attempting to install...")
install_from_git("git+https://github.com/wasiahmad/livecodebench.git@livecodebench_pro")
try:
from livecodebench.evaluate import evaluate
except ImportError:
LOG.info("Failed to install 'livecodebench'. Please install it manually.")
raise

jsonl_file = cfg.input_file
samples = []
with open(jsonl_file) as f:
samples = [preprocess_code(json.loads(line), "python") for line in f]
for sample in samples:
sample["problem_id"] = sample.pop("task_id")
sample["text_response"] = sample.pop("completion")
sample["response_meta"] = None
for line in f:
sample = json.loads(line)
sample = preprocess_code(sample, language=cfg.language, strip_whitespace=True)
sample["code_list"] = [sample["completion"]]
samples.append(sample)

with open(jsonl_file, "wt", encoding="utf-8") as f:
for sample in samples:
f.write(json.dumps(sample) + "\n")

evaluate(
custom_output_file=jsonl_file,
language=cfg.language,
test_file=cfg.test_file,
test_dir=cfg.test_dir,
k_list=[1],
num_process_evaluate=cfg.num_processes,
timeout=cfg.timeout,
)

with open(jsonl_file[:-6] + "_eval_results.json", "rt", encoding="utf-8") as fin:
eval_grades = json.load(fin)
with open(jsonl_file, "wt", encoding="utf-8") as f:
for sample in samples:
if sample["problem_id"] in eval_grades["eval"]:
sample["graded_list"] = eval_grades["eval"][sample["problem_id"]]["graded_list"]
f.write(json.dumps(sample) + "\n")

# moving eval file to ensure metrics are recomputed
shutil.move(jsonl_file[:-6] + "_eval_results.json", jsonl_file[:-6] + "_eval_results-saved.json")


def eval_livebench_coding(cfg):
cfg = BaseEvaluatorConfig(**cfg)
@@ -271,12 +271,12 @@ def eval_livebench_coding(cfg):
sample = json.loads(line)
if sample["task"] == "coding_completion":
assert len(sample["partial_solution"]) > 0
sample = preprocess_code(sample, strip_whitespace=False)
sample = preprocess_code(sample, language="python", strip_whitespace=False)
sample["completion"] = sample["completion"].replace("\t", " ")
full_solution = sample["partial_solution"] + "\n" + sample["completion"]
sample["code_list"] = [full_solution]
else:
sample = preprocess_code(sample, strip_whitespace=True)
sample = preprocess_code(sample, language="python", strip_whitespace=True)
sample["code_list"] = [sample["completion"]]

samples.append(sample)
@@ -332,7 +332,7 @@ def eval_bigcodebench(cfg):
samples = []
with open(jsonl_file) as f:
for line in f:
generation_dict = preprocess_code(json.loads(line))
generation_dict = preprocess_code(json.loads(line), language="python")
generation_dict["solution"] = generation_dict.pop("completion")
samples.append(generation_dict)
with open(jsonl_file, "wt", encoding="utf-8") as f:
@@ -417,7 +417,7 @@ def postprocess_code(sample):
elif data_split != sample["split"]:
raise ValueError(f"All samples should have the same split, but got {data_split} and {sample['split']}")

sample = preprocess_code(sample, strip_whitespace=False)
sample = preprocess_code(sample, language="python", strip_whitespace=False)
sample["original_completion"] = sample["completion"]
sample = postprocess_code(sample)
samples.append(sample)
1 change: 1 addition & 0 deletions nemo_skills/evaluation/metrics/map_metrics.py
@@ -57,6 +57,7 @@
"multichoice": MathMetrics,
"ruler": RulerMetrics,
"livecodebench": LiveCodeBenchMetrics,
"livecodebench_pro": LiveCodeBenchMetrics,
"swe-bench": SweBenchMetrics,
"scicode": SciCodeMetrics,
"bigcodebench": BigCodeBenchMetrics,
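The new `livecodebench_pro` key reuses the existing `LiveCodeBenchMetrics` class. A sketch of how such a string-to-class map is typically resolved (class and function names here are illustrative, not the repo's actual API):

```python
class LiveCodeBenchMetrics:
    """Stand-in for the real metrics class."""

METRICS_MAP = {
    "livecodebench": LiveCodeBenchMetrics,
    "livecodebench_pro": LiveCodeBenchMetrics,  # new entry shares the class
}

def get_metrics_class(metrics_type: str):
    # A dataset's METRICS_TYPE string selects its metrics implementation.
    return METRICS_MAP[metrics_type]

print(get_metrics_class("livecodebench_pro").__name__)  # → LiveCodeBenchMetrics
```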