Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
848 commits
Select commit Hold shift + click to select a range
9dc70bd
Fix timeout bug for LEAN4 code execution (#647)
fchen97 Aug 7, 2025
27f5e44
rename folder, add datasets (#651)
ekmb Aug 7, 2025
62d0845
Improve client (#652)
smahdavi4 Aug 7, 2025
3806646
Fixes for code execution (#656)
Kipok Aug 8, 2025
446ed09
Add skip_special_tokens=False for completions (#657)
Kipok Aug 8, 2025
33a8baa
Fix timeout raise in sandbox
Kipok Aug 9, 2025
4cfd725
Fix timeout for code exec
Kipok Aug 9, 2025
5cd44ab
Update annotation
Kipok Aug 9, 2025
8755ab5
Update timeout to 4 hours
Kipok Aug 9, 2025
4e931e6
Online GenSelect (#655)
shtoshni Aug 11, 2025
ed68025
Adding long context benchmark MRCR (#634)
fayejf Aug 11, 2025
45ac117
Fix a small bug in generation with chunks (#661)
fchen97 Aug 12, 2025
1bd199d
Small fix for mrcr prepare.py (#662)
fayejf Aug 12, 2025
58af382
Fix base checkpoints in the docs
Kipok Aug 12, 2025
6a12754
Fix formatting
Kipok Aug 12, 2025
c9d361d
Fix type mismatch for max code executions (#665)
Kipok Aug 13, 2025
642b92a
Allow generation type or custom module in `eval` pipeline (#666)
activatedgeek Aug 13, 2025
f200cac
update grpo with megatron backend (#653)
wedu-nvidia Aug 13, 2025
34b90a5
bugfix: missing generation module arg in eval pipeline cmd script (#668)
activatedgeek Aug 13, 2025
eab2d58
add support for nsys profile (#667)
wedu-nvidia Aug 14, 2025
8c27f28
Fixing BFCL (#669)
shtoshni Aug 14, 2025
eba855e
Minor fixes to dataset defaults (#672)
shtoshni Aug 14, 2025
be737a0
Enable system_message for openai prompt format (#670)
Kipok Aug 14, 2025
e88c084
Reproducing Llama Nemotron Results with NeMo-Skills (#676)
shtoshni Aug 15, 2025
e600c78
Fixes for docs
Kipok Aug 15, 2025
3022a9b
Add SWE-bench inference & evaluation (#671)
ludwig-n Aug 15, 2025
5b00e48
Remove prompt template (#673)
Kipok Aug 16, 2025
86ee4e4
allow overlapping sandbox with run_cmd (#680)
SeanNaren Aug 16, 2025
ff19286
Majority + Pass@k fix (#679)
shtoshni Aug 18, 2025
f075214
Remove broken benchmark
Kipok Aug 18, 2025
f1fb1b3
Pin openai version to v1.99.9 (#684)
ludwig-n Aug 18, 2025
9a47605
Allow customizing SWE-agent/OpenHands repo & commit for SWE-bench (#682)
ludwig-n Aug 18, 2025
5df438b
Fix 'Argument list too long' error in get_remaining_jobs (#674)
tamohannes Aug 18, 2025
7dace78
fixing conflicts
wasiahmad Aug 18, 2025
b178cc9
fixing conflicts
wasiahmad Aug 18, 2025
e020abd
fix data cache bug (#685)
wedu-nvidia Aug 18, 2025
71e4fa3
Fixes for ruler, simplified recipe and some docs (#686)
Kipok Aug 18, 2025
397943f
Update prepare.py (#687)
fayejf Aug 18, 2025
4cd131d
lcb updates to work with pypy3
wasiahmad Aug 19, 2025
a78f0e7
lcb updates to work with pypy3
wasiahmad Aug 19, 2025
e2a687a
Patch for LCB score calculation fix (#688)
wasiahmad Aug 19, 2025
cc69fd4
Fix cache variable (#692)
smahdavi4 Aug 19, 2025
0dab4a8
Add docs for ruler repro in tutorial (#693)
Kipok Aug 20, 2025
47f7fe7
New evaluation docs (#694)
Kipok Aug 20, 2025
c672325
Add support for api_key_env_var (#696)
Glorf Aug 21, 2025
722422b
fix: scicode missing functions (#699)
jubick1337 Aug 21, 2025
2e21e9b
BFCL Multi turn fixes (#704)
shtoshni Aug 21, 2025
b5c289a
Scicode fixed numbers for Llama Nemotron (#705)
shtoshni Aug 21, 2025
1d8b332
Fix links (#706)
Kipok Aug 21, 2025
85a66ce
Update CONTRIBUTING.md (#697)
activatedgeek Aug 21, 2025
096f52a
gpt-oss integration (#711)
Kipok Aug 22, 2025
db8f2e5
Error handing for long context stuff (#708)
shtoshni Aug 22, 2025
2fa7da7
Support gemini api (#698)
hsiehjackson Aug 22, 2025
7685b2f
add beyondaime (#707)
wedu-nvidia Aug 22, 2025
e6f6a89
Update lcb splits (#714)
shtoshni Aug 22, 2025
fcaf706
handle exp name too long (#716)
fayejf Aug 22, 2025
6eecc91
Add IOI (#701)
SeanNaren Aug 22, 2025
fc1c13f
Nano v2 tutorial (#718)
shtoshni Aug 23, 2025
4b9b2d0
Fix link in readme
Kipok Aug 23, 2025
b26df04
Removing conversion tests (#720)
shtoshni Aug 25, 2025
4bcdcf9
Fix shell execution interface (#719)
Kipok Aug 25, 2025
6c1166e
BFCL v3 data prep - Repo version pinning (#725)
shtoshni Aug 26, 2025
34d961b
add hf_home check (#702)
wedu-nvidia Aug 26, 2025
095f1aa
Ipython session affinity (#622)
gwarmstrong Aug 26, 2025
afd373d
Add SFT data translation module (#721)
shuoyangd Aug 26, 2025
1d3fd35
Fix streaming in async code execution (#722)
Kipok Aug 26, 2025
120af1e
Add SWE-bench docs (#726)
ludwig-n Aug 27, 2025
4cf1f48
fix model path in docs/basics aime eval example (#728)
stephencge Aug 27, 2025
d921a1a
Adding MCP clients (#713)
gwarmstrong Aug 27, 2025
c0b5022
Cleaning up references to TRTLLM model (#731)
shtoshni Aug 27, 2025
53f83c7
Small fix for web search (#732)
smahdavi4 Aug 27, 2025
07f1163
Fix judge for non-default benchmarks (#727)
Kipok Aug 27, 2025
75e84d1
Fix SciCode evaluation (#730)
jubick1337 Aug 27, 2025
31bcbfd
Fix apptainer installation in docker (#734)
Kipok Aug 27, 2025
bd998ff
Defer server wait for generation (#735)
smahdavi4 Aug 28, 2025
a5f3bcc
HLE Fix (#733)
shtoshni Aug 28, 2025
d922da3
Update evaluation numbers and a default split for scicode (#739)
Kipok Aug 28, 2025
173f811
Add quotes
Kipok Aug 28, 2025
1899ad1
Tool support based on Chat Completion APIs (#717)
activatedgeek Aug 28, 2025
07ad14c
Fix ruff config (#740)
activatedgeek Aug 28, 2025
06cdacb
Remove pin from openai req (#742)
Kipok Aug 28, 2025
0367809
Parallel evaluation on a single node (#743)
Kipok Aug 28, 2025
e256ffd
Small fix to parallel mode
Kipok Aug 28, 2025
3ac390f
Scicode updates to plot (#744)
shtoshni Aug 28, 2025
85baa02
Soft failure (#723)
shtoshni Aug 29, 2025
2fecec3
feat: unit test on slurm (#675)
wedu-nvidia Aug 29, 2025
c8c6382
AIMO inference tutorial (#745)
darraghdog Aug 29, 2025
02b6c78
Bug Fix w/ Translation Module (#746)
shuoyangd Aug 29, 2025
a740b54
Updates to file formatting (#747)
shtoshni Aug 29, 2025
69ff32b
Precommit fixes more complex (#748)
shtoshni Aug 29, 2025
6f1a687
Add a tutorial on running gpt-oss with python tool (#750)
Kipok Aug 29, 2025
9e7774c
Fix missing import
Kipok Aug 30, 2025
f902d45
BFCL Docs (#753)
shtoshni Aug 30, 2025
d2ee015
Slurm tests enhancements (#754)
Kipok Aug 30, 2025
391652d
Remove wandb_project
Kipok Aug 30, 2025
38de272
Fixes for slurm tests (#755)
Kipok Aug 30, 2025
2c5f31d
More fixes for slurm tests and nemo-rl sft (#756)
Kipok Sep 1, 2025
b092a9c
update nemo-rl to latest main (#752)
wedu-nvidia Sep 1, 2025
984e00c
update nemo-rl to latest main (#752)
wedu-nvidia Sep 1, 2025
36f9c91
Server container now can be passed with CLI (#758)
vmendelev Sep 3, 2025
7eed4be
BFCL v3 Testing + Refactoring (#761)
shtoshni Sep 3, 2025
b80d807
Add MMLU-Pro-X (#751)
shuoyangd Sep 3, 2025
6a3a0b0
Prompt examples added + Minor changes to prompt construction (#764)
shtoshni Sep 3, 2025
383b428
Generate docs with context length error part added (#765)
shtoshni Sep 3, 2025
61ac015
Natural Language math docs (#767)
avem-nv Sep 4, 2025
e2f23a1
Reduce default `max_concurrent_requests` back to 512 (#770)
shuoyangd Sep 4, 2025
0fee3e4
disable validation for nemo-rl if no validation data is provided (#766)
wedu-nvidia Sep 4, 2025
27e4800
Adjust slurm parameters (#771)
Kipok Sep 4, 2025
367ae37
Update to slurm tests setup
Kipok Sep 4, 2025
c6ad546
Update instruction for cron
Kipok Sep 4, 2025
51ee492
Update test constraints
Kipok Sep 4, 2025
2c372b6
Small doc fix
Kipok Sep 4, 2025
d463ba3
MCP interface updates (#772)
gwarmstrong Sep 4, 2025
5e94850
Fix to not use completions api when soft_fail=True (#774)
Kipok Sep 4, 2025
3be5070
Fix MCP env propagation (#775)
gwarmstrong Sep 4, 2025
af217f6
Fixing docs (#776)
shtoshni Sep 4, 2025
0c26b5a
Update slurm tests (#777)
Kipok Sep 5, 2025
aaffceb
Update constraints
Kipok Sep 5, 2025
1a27645
Fix async tool registration (#781)
gwarmstrong Sep 5, 2025
988130c
Fix OpenHands patch issue & add more info to SWE-bench docs (#778)
ludwig-n Sep 5, 2025
f59b6c0
Potential solution to logging duplication (#782)
shtoshni Sep 5, 2025
fccb153
TRTLLM + Soft Fail (#786)
shtoshni Sep 5, 2025
60430e5
Add litellm cache (#789)
smahdavi4 Sep 7, 2025
75fd5b8
GenSelect Online (#783)
shtoshni Sep 7, 2025
52d544a
Add standard deviation metrics for benchmark variance analysis (#757)
AdamRajfer Sep 8, 2025
3413dc6
GenSelect -> GenEvolution (#791)
shtoshni Sep 8, 2025
a9f31e5
Slurm tests refactoring (#795)
Kipok Sep 8, 2025
0364637
GenEvolution docs (#792)
shtoshni Sep 8, 2025
fda85a3
Fix vllm multi-node + add conversion to int for gpus (#796)
Kipok Sep 9, 2025
d9dc1b9
Gpt-oss-python slurm test + more small refactoring (#797)
Kipok Sep 9, 2025
2729707
gen select/synth default_factory (#800)
stephencge Sep 9, 2025
1aca76e
Update constraints
Kipok Sep 9, 2025
26f591d
Fix SWE-bench parallel runs and other issues (#794)
ludwig-n Sep 9, 2025
db07c74
updated config for sft data prep (#785)
wasiahmad Sep 9, 2025
cb4c0be
lean eval last code block and has sorry logic fix (#801)
stephencge Sep 10, 2025
a02d527
update max_tokens to max_completion_tokens for openai api (#804)
jiacheng-xu Sep 10, 2025
e6c27aa
Add build stage for docker images (#805)
gwarmstrong Sep 11, 2025
5730a9e
FIX Cleanup Sessions on timeout (#803)
gwarmstrong Sep 11, 2025
47740e8
Add search to docs (#807)
darraghdog Sep 11, 2025
6780b53
enable msg format data passing for sft (#806)
wasiahmad Sep 11, 2025
e03d046
fix hf_model as None bug (#808)
wedu-nvidia Sep 12, 2025
f5614ad
add lr scheduler for nemo-rl sft with fsdp as backend (#759)
wedu-nvidia Sep 13, 2025
a9b3ca5
GenSynthesis prompt updates (#809)
shtoshni Sep 14, 2025
4d3f817
Make OpenHands use uploaded dataset instead of redownloading from HF …
ludwig-n Sep 15, 2025
ed25dda
Update constraints
Kipok Sep 15, 2025
c74bdeb
Unifying context length error handling (#812)
shtoshni Sep 16, 2025
1f45d0c
add MathOlympiadBench (#814)
stephencge Sep 16, 2025
174d271
update miniF2F dataset (#813)
stephencge Sep 17, 2025
58f3d55
fix typo (#816)
wedu-nvidia Sep 18, 2025
678cced
Fix tool calling (#815)
Kipok Sep 18, 2025
a2a68ed
Update filters.trim_solutions=false to be default (#817)
Kipok Sep 18, 2025
6decc47
Fix typo in grpo and tests (#818)
Kipok Sep 18, 2025
e2bd5dd
Evaluation on BigCodeBench (#547)
wasiahmad Sep 18, 2025
f8897ac
A small bug fix (#819)
wasiahmad Sep 18, 2025
c6a656f
Disallow empty prepare_data (#820)
Kipok Sep 18, 2025
75c1148
Move sequence parallel to top level policy (#823)
Kipok Sep 18, 2025
00ebe52
Evaluation on LiveBench-Coding (#821)
wasiahmad Sep 18, 2025
e0a26f7
Expose timeout parameter for individual calls (#827)
Kipok Sep 19, 2025
6492a70
Implement token std statistics (#826)
AdamRajfer Sep 19, 2025
478765d
Add Long context benchmark AA-LCR (#798)
fayejf Sep 19, 2025
8160c6f
fixing merge conflicts
wasiahmad Sep 19, 2025
906246d
fixing merge conflicts
wasiahmad Sep 19, 2025
bb21b6b
code logic reorganized
wasiahmad Sep 19, 2025
b15b3a0
code logic reorganized
wasiahmad Sep 19, 2025
3a8b9ad
lcb eval harness main branch need to be used with pypy3
wasiahmad Sep 19, 2025
2d8a042
lcb eval harness main branch need to be used with pypy3
wasiahmad Sep 19, 2025
df93e20
Update default sandbox parameters (#830)
Kipok Sep 20, 2025
3163ea8
update to latest commit (#831)
wedu-nvidia Sep 20, 2025
c2ba406
Bump nemo-rl version to 0.7.1 (#832)
Kipok Sep 21, 2025
68d6144
Merge branch 'main' into feat/lcb_eval
wasiahmad Sep 22, 2025
ebd5bab
Merge branch 'main' into feat/lcb_eval
wasiahmad Sep 22, 2025
2da40f1
Add aa lcr to aai (#836)
fayejf Sep 22, 2025
69b501f
Add new benchmark SimpleQA to nemo_skills (#828)
jiacheng-xu Sep 22, 2025
2afdbfe
hle with detail splits (#837)
jiacheng-xu Sep 23, 2025
9b7ce0c
timeout should be int
wasiahmad Sep 23, 2025
66dbaa6
timeout should be int
wasiahmad Sep 23, 2025
6b7af16
separating lcb code eval into a different file
wasiahmad Sep 23, 2025
93ec8b4
separating lcb code eval into a different file
wasiahmad Sep 23, 2025
e1842df
fixing minor issue
wasiahmad Sep 23, 2025
a680edc
fixing minor issue
wasiahmad Sep 23, 2025
b3d4612
fixing minor issue
wasiahmad Sep 23, 2025
4153e04
fixing minor issue
wasiahmad Sep 23, 2025
d64fa43
Merge branch 'main' into feat/lcb_eval
wasiahmad Sep 23, 2025
abb4805
Merge branch 'main' into feat/lcb_eval
wasiahmad Sep 23, 2025
e6d6b02
minor updates
wasiahmad Sep 23, 2025
6af340e
minor updates
wasiahmad Sep 23, 2025
1abbb9c
Asynchronous eval in Generation Loop (#825)
gwarmstrong Sep 23, 2025
2d294ff
further optimizations
wasiahmad Sep 24, 2025
d2388c4
further optimizations
wasiahmad Sep 24, 2025
6e68efa
NeMo-RL SFT sample printing (to verify if template is applied) (#833)
wasiahmad Sep 25, 2025
5196289
Update default MCQ prompts (GPQA, MMLU-Pro) to non-boxed format (#843)
ekmb Sep 25, 2025
b25de59
Only run env var check for identity file when key available (#850)
activatedgeek Sep 26, 2025
ba480a6
minor issue fix
wasiahmad Sep 26, 2025
20888db
minor issue fix
wasiahmad Sep 26, 2025
a9c0bf2
minor issue fix
wasiahmad Sep 26, 2025
b887bc7
minor issue fix
wasiahmad Sep 26, 2025
1f2985c
fixing indent issue
wasiahmad Sep 26, 2025
976e6da
fixing indent issue
wasiahmad Sep 26, 2025
0fb1b8e
fixing file issues
wasiahmad Sep 26, 2025
c10e26a
fixing file issues
wasiahmad Sep 26, 2025
bf6231a
Sandbox history restoration fix (#838)
i-vainn Sep 26, 2025
1d64b6c
changing lcb eval harness url
wasiahmad Sep 26, 2025
27f886b
changing lcb eval harness url
wasiahmad Sep 26, 2025
268d971
Merge remote-tracking branch 'origin/main' into feat/lcb_eval
wasiahmad Sep 26, 2025
31db2de
Merge remote-tracking branch 'origin/main' into feat/lcb_eval
wasiahmad Sep 26, 2025
7798899
pypy3 testing with datasets
wasiahmad Sep 26, 2025
0c73f0e
pypy3 testing with datasets
wasiahmad Sep 26, 2025
694c107
keeping test cases
wasiahmad Sep 26, 2025
aa6c209
keeping test cases
wasiahmad Sep 26, 2025
3c3a73d
keeping test cases for pypy3 use
wasiahmad Sep 26, 2025
3df9308
keeping test cases for pypy3 use
wasiahmad Sep 26, 2025
c759a74
keeping test cases for pypy3 use
wasiahmad Sep 26, 2025
758333b
keeping test cases for pypy3 use
wasiahmad Sep 26, 2025
2829c20
debugging
wasiahmad Sep 26, 2025
07a13cb
debugging
wasiahmad Sep 26, 2025
c3f6a23
fix data prep issues
wasiahmad Sep 26, 2025
a6eab74
fix data prep issues
wasiahmad Sep 26, 2025
193c940
dataset preparation updated
wasiahmad Sep 26, 2025
47798dc
dataset preparation updated
wasiahmad Sep 26, 2025
8597857
changing lcb eval harness branch name
wasiahmad Sep 27, 2025
b096d9b
changing lcb eval harness branch name
wasiahmad Sep 27, 2025
9e2e0bd
Slurm: fix time format, and allow default timeout (#853)
artbataev Sep 28, 2025
7c61cf1
Prompt sensitivity (multiprompt eval) support (#847)
gnalbandyan Sep 29, 2025
59430b6
Fix HF_TOKEN assignment. Fix env vars priority: config -> environment…
artbataev Sep 29, 2025
8fa3fc2
Merge branch 'main' into feat/lcb_eval
wasiahmad Sep 29, 2025
1ed6bb1
Merge branch 'main' into feat/lcb_eval
wasiahmad Sep 29, 2025
5e9eaf7
Megatron backend changes: minor fix, add random ports (#862)
lizziew Sep 29, 2025
b849bfb
fix wandb (#859)
wedu-nvidia Sep 29, 2025
9ee8ea4
Allow setting random seeds for benchmark groups (#860)
Kipok Sep 30, 2025
1a59903
Generation time + Input Sequence Length (#865)
shtoshni Sep 30, 2025
d2c5863
Small for for isl calc (#868)
Kipok Sep 30, 2025
680d00b
Proper fix for isl (#869)
Kipok Sep 30, 2025
2650a75
Adding support for arm64 containers (#856)
Kipok Sep 30, 2025
a5bede1
Merge branch 'main' into feat/lcb_eval
wasiahmad Sep 30, 2025
0751a89
Merge branch 'main' into feat/lcb_eval
wasiahmad Sep 30, 2025
dd27f01
adding LCB docs
wasiahmad Sep 30, 2025
0ccde9f
adding LCB docs
wasiahmad Sep 30, 2025
cc875fd
revert nemo-rl patch (#871)
activatedgeek Sep 30, 2025
bdf6b37
Remove sharding docs (#872)
smahdavi4 Sep 30, 2025
65e99b2
Adding support for training with megatron-lm (#873)
Kipok Oct 1, 2025
1050ecf
Merge branch 'main' into feat/lcb_eval
wasiahmad Oct 1, 2025
60087ac
Merge branch 'main' into feat/lcb_eval
wasiahmad Oct 1, 2025
ca09081
Evaluation on OJBench (#848)
wasiahmad Oct 1, 2025
9028635
fixing merge conflicts
wasiahmad Oct 1, 2025
4970d6e
fixing merge conflicts
wasiahmad Oct 1, 2025
4042286
resolving conflicts
wasiahmad Oct 1, 2025
27fde4f
merging
wasiahmad Oct 1, 2025
3729e9a
updating docs
wasiahmad Oct 1, 2025
29dad0f
Merge branch 'main' into feat/lcb_eval
wasiahmad Oct 1, 2025
d050d9f
minor doc update
wasiahmad Oct 1, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file added __init__.py
Empty file.
82 changes: 82 additions & 0 deletions docs/evaluation/code.md
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,88 @@ all you need to do is replace `openhands` with `swe_agent` in the command above.
- Benchmark is defined in [`nemo_skills/dataset/livecodebench/__init__.py`](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/dataset/livecodebench/__init__.py)
- Original benchmark source is [here](https://github.com/LiveCodeBench/LiveCodeBench).

#### Data Preparation

First, prepare the dataset by running the `ns prepare_data` command. The arguments below will generate `test_v6_2408_2505.jsonl`.

```
ns prepare_data livecodebench --release_version v6 --start_date 2024-08 --end_date 2025-05
```

##### For Pypy3 Evaluation:
If you plan to evaluate using the PyPy3 interpreter, you must add the `--keep_all_columns` flag during data preparation. This downloads a larger dataset (~1.9 GB) that contains the necessary test cases, so we recommend downloading it directly to a Slurm cluster location.

```
ns prepare_data livecodebench --release_version v6 --start_date 2024-08 --end_date 2025-05 --keep_all_columns --cluster=<CLUSTER_NAME> --data_dir=<DATA_DIR>
```

#### Running the Evaluation

Once the data is prepared, you can run the evaluation. Replace `<...>` placeholders with your cluster and directory paths.

##### Standard Python Evaluation

This command runs an evaluation of [OpenReasoning-Nemotron-32B](https://huggingface.co/nvidia/OpenReasoning-Nemotron-32B) on a Slurm cluster.

```
ns eval \
--cluster=<CLUSTER_NAME> \
--model=nvidia/OpenReasoning-Nemotron-32B \
--server_type=vllm \
--server_args="--async-scheduling" \
--server_nodes=1 \
--server_gpus=8 \
--benchmarks=livecodebench \
--split=test_v6_2408_2505 \
--data_dir=<DATA_DIR> \
--output_dir=<OUTPUT_DIR> \
--extra_eval_args="++eval_config.interpreter=python" \
--with_sandbox \
++inference.temperature=0.6 \
++inference.top_p=0.95 \
++inference.tokens_to_generate=65536
```

##### Pypy3 Evaluation

To run with the Pypy3 interpreter, modify the `--extra_eval_args` flag as shown below.
```bash
--extra_eval_args="++eval_config.interpreter=pypy3 ++eval_config.test_file=<DATA_DIR>/livecodebench/test_v6_2408_2505.jsonl"
```
Comment on lines +227 to +228
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Tag the interpreter override snippet as bash.

Prevents MD040 lint failures.

-```
+```bash
 --extra_eval_args="++eval_config.interpreter=pypy3 ++eval_config.test_file=<DATA_DIR>/livecodebench/test_v6_2408_2505.jsonl"
-```
+```
🤖 Prompt for AI Agents
In docs/evaluation/code.md around lines 227-228, the code fence containing the
--extra_eval_args snippet is not tagged with a language which triggers MD040;
update the opening fence to ```bash so the snippet is explicitly labeled as bash
and ensure the closing ``` fence remains present and correctly placed.


##### Verifying Results

After all jobs are complete, you can check the results in `<OUTPUT_DIR>/eval-results/livecodebench/metrics.json`. You can also take a look at `<OUTPUT_DIR>/eval-results/livecodebench/summarized-results/main_*`. They should look something like this:

```text
-------------------------- livecodebench --------------------------
evaluation_mode | num_entries | avg_tokens | gen_seconds | accuracy
pass@1 | 454 | 15995 | 2188 | 71.15%


------------------------ livecodebench-easy -----------------------
evaluation_mode | num_entries | avg_tokens | gen_seconds | accuracy
pass@1 | 110 | 5338 | 1806 | 99.09%


------------------------ livecodebench-hard -----------------------
evaluation_mode | num_entries | avg_tokens | gen_seconds | accuracy
pass@1 | 203 | 23031 | 2188 | 46.31%


----------------------- livecodebench-medium ----------------------
evaluation_mode | num_entries | avg_tokens | gen_seconds | accuracy
pass@1 | 141 | 14178 | 1889 | 85.11%
```
Comment on lines +235 to +253
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Declare the metrics block as plain text.

The ASCII table isn’t JSON; marking it as text satisfies MD040 and keeps formatting intact.

-```
+```text
 -------------------------- livecodebench --------------------------
 evaluation_mode | num_entries | avg_tokens | gen_seconds | accuracy
@@
 pass@1          | 141         | 14178      | 1889        | 85.11%
-```
+```
🤖 Prompt for AI Agents
In docs/evaluation/code.md around lines 235 to 253, the ASCII metrics table is
currently in a fenced code block without a language, triggering MD040; change
the fence to declare the block as plain text by adding "text" after the opening
triple backticks (i.e., use ```text) and keep the closing triple backticks
unchanged so the table renders as plain text and preserves formatting.


##### Advanced: Averaging Multiple Runs

Due to variance between runs, you can automatically repeat the evaluation and average the results. To run the evaluation 3 times, for example, set the `--benchmarks` flag as follows:

```
--benchmarks=livecodebench:3
```

### livecodebench-pro

- Benchmark is defined in [`nemo_skills/dataset/livecodebench-pro/__init__.py`](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/dataset/livecodebench-pro/__init__.py)
Expand Down
54 changes: 26 additions & 28 deletions nemo_skills/dataset/livecodebench/prepare.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from datetime import datetime
from pathlib import Path

from datasets import load_dataset
from datasets import Value, load_dataset
from dateutil.relativedelta import relativedelta


Expand Down Expand Up @@ -66,7 +66,7 @@ def parse_month_range(start_date, end_date):
raise ValueError(str(e))


def clean_data(dataset):
def clean_data(dataset, keep_all_columns=False):
def map_fn(data):
question = data["question_content"] + "\n\n"
if data["starter_code"]:
Expand All @@ -80,22 +80,26 @@ def map_fn(data):
data["question"] = question.replace(" ", "\t")
return data

remove_columns = [
"question_title",
"contest_id",
"public_test_cases",
"private_test_cases",
"metadata",
"question_content",
"platform",
"question_id",
"starter_code",
]
remove_columns = []
if not keep_all_columns:
remove_columns = [
"question_title",
"contest_id",
"metadata",
"question_content",
"platform",
"question_id",
"starter_code",
"public_test_cases",
"private_test_cases",
]
dataset = dataset.cast_column("public_test_cases", Value("large_string"))
dataset = dataset.cast_column("private_test_cases", Value("large_string"))
dataset = dataset.map(map_fn, remove_columns=remove_columns)
return dataset


def prepare(start_date, end_date, release_version, output_dir):
def prepare(start_date, end_date, release_version, output_dir, keep_all_columns=False):
start_date, end_date = parse_month_range(start_date, end_date)
start_yymm = start_date.strftime("%y%m")
end_yymm = end_date.strftime("%y%m")
Expand All @@ -104,7 +108,7 @@ def prepare(start_date, end_date, release_version, output_dir):
assert release_version in ["v1", "v2", "v3", "v4", "v5", "v6"]

data = parse_data(release_version=f"release_{release_version}")
data = clean_data(data)
data = clean_data(data, keep_all_columns)
print("Len of data: ", len(data))

print("Writing to file...")
Expand All @@ -115,16 +119,10 @@ def prepare(start_date, end_date, release_version, output_dir):
for problem in data:
input_date = datetime.strptime(problem["contest_date"], "%Y-%m-%dT%H:%M:%S").date()
if start_date <= input_date <= end_date:
json.dump(
{
"task_id": problem["task_id"],
"question": problem["question"],
"difficulty": problem["difficulty"],
"subset_for_metrics": problem["difficulty"],
"release_version": release_version,
},
f,
)
output_record = {**problem}
output_record["subset_for_metrics"] = problem["difficulty"]
output_record["release_version"] = release_version
json.dump(output_record, f)
f.write("\n")


Expand All @@ -135,29 +133,29 @@ def prepare(start_date, end_date, release_version, output_dir):
("v6", "2024-08", "2025-05"), # current default in lb
]


if __name__ == "__main__":
    # CLI entry point: prepare either all default LiveCodeBench splits or a
    # single custom split defined by release version and a month range.
    parser = argparse.ArgumentParser()
    parser.add_argument("--output_dir", type=str, default=str(Path(__file__).parent))
    parser.add_argument("--release_version", type=str, default="all")
    # Fixed copy-paste bug: the --start_date help text previously read
    # "End date in YYYY-MM format".
    parser.add_argument("--start_date", type=str, default="all", help="Start date in YYYY-MM format")
    parser.add_argument("--end_date", type=str, default="all", help="End date in YYYY-MM format")
    parser.add_argument("--keep_all_columns", action="store_true", help="keep all columns in the output jsonl file")

    args = parser.parse_args()

    if args.release_version == "all" and args.start_date == "all" and args.end_date == "all":
        # No split selected explicitly: prepare every default split.
        for release_version, start_date, end_date in DEFAULT_SPLITS:
            print(f"Processing data for {release_version} from {start_date} to {end_date}")
            prepare(start_date, end_date, release_version, args.output_dir, args.keep_all_columns)
    else:
        # A custom split requires all three selectors; partial specification
        # is ambiguous, so reject it explicitly.
        if args.release_version == "all" or args.start_date == "all" or args.end_date == "all":
            raise ValueError(
                "If preparing a custom split, you must specify all "
                "--release_version, --start_date, and --end_date arguments."
            )
        prepare(args.start_date, args.end_date, args.release_version, args.output_dir, args.keep_all_columns)

# test_v5_2408_2502.jsonl: 279 samples
# test_v5_2410_2502.jsonl: 166 samples
Expand Down
2 changes: 1 addition & 1 deletion nemo_skills/evaluation/evaluator/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,12 @@
eval_bigcodebench,
eval_evalplus,
eval_livebench_coding,
eval_livecodebench,
eval_livecodebench_pro,
)
from nemo_skills.evaluation.evaluator.ifbench import eval_ifbench
from nemo_skills.evaluation.evaluator.ifeval import eval_if
from nemo_skills.evaluation.evaluator.ioi import eval_ioi
from nemo_skills.evaluation.evaluator.livecodebench import eval_livecodebench
from nemo_skills.evaluation.evaluator.math import (
Lean4ProofEvaluator,
Lean4StatementEvaluator,
Expand Down
97 changes: 16 additions & 81 deletions nemo_skills/evaluation/evaluator/code.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@

from omegaconf import OmegaConf

from nemo_skills.utils import get_logger_name, nested_dataclass, unroll_files
from nemo_skills.file_utils import unroll_files
from nemo_skills.utils import get_logger_name

LOG = logging.getLogger(get_logger_name(__file__))

Expand Down Expand Up @@ -99,86 +100,6 @@ def install_from_git(git_url):
print(f"Error during installation: {e}")


# TODO: use sandbox
@nested_dataclass(kw_only=True)
class LiveCodeBenchEvaluatorConfig:
    """Settings for the LiveCodeBench evaluator (see ``eval_livecodebench``)."""

    # Language of the generated solutions; "python" or "cpp" are supported.
    language: str = "python"  # "cpp" is another option now
    # Path to an external test-case file; required (and only used) when
    # language == "cpp".
    test_file: str = None


def eval_livecodebench(cfg):
    """Grade generated solutions with the external ``livecodebench`` package.

    For each input ``.jsonl`` file, samples are rewritten in place into the
    format the package expects (``question_id``/``code_list``), the package's
    ``evaluate`` entry point is invoked, and the resulting per-sample
    ``graded_list`` is merged back into the same file.

    Args:
        cfg: evaluator config; must provide ``input_files`` (glob patterns)
            and ``eval_config`` (parsed as ``LiveCodeBenchEvaluatorConfig``).

    Raises:
        ImportError: if ``livecodebench`` cannot be imported or installed.
        ValueError: if samples mix different ``release_version`` values.
    """
    try:
        from livecodebench.evaluate import evaluate
    except ImportError:
        LOG.info("Package 'livecodebench' not found. Attempting to install...")
        # install_from_git("git+https://github.com/wasiahmad/livecodebench.git")
        # Pinned to an exact commit for reproducible evaluation results.
        install_from_git("git+https://github.com/wasiahmad/livecodebench.git@f285640c20aaf18df1ee5917621a596af4630b5e")
        try:
            from livecodebench.evaluate import evaluate
        except ImportError:
            LOG.info("Failed to install 'livecodebench'. Please install it manually.")
            raise

    eval_config = LiveCodeBenchEvaluatorConfig(_init_nested=True, **cfg.eval_config)
    assert eval_config.language in ["python", "cpp"]
    if eval_config.language == "cpp":
        # cpp grading needs the external test-case file; python does not.
        assert eval_config.test_file is not None

    # All files/samples must share one release version, since a single
    # `evaluate` call is made per file with that version.
    release_version = None
    for jsonl_file in unroll_files(cfg.input_files):
        with open(jsonl_file) as f:
            samples = [preprocess_code(json.loads(line), eval_config.language) for line in f]
        for sample in samples:
            # Keys required by the livecodebench grader.
            sample["question_id"] = sample["task_id"]
            sample["code_list"] = [sample["completion"]]
            if release_version is None:
                release_version = sample["release_version"]
            if release_version != sample["release_version"]:
                raise ValueError(
                    f"All samples should have the same release version, "
                    f"but got {release_version} and {sample['release_version']}"
                )

        # Overwrite the input file in the grader's expected format.
        with open(jsonl_file, "wt", encoding="utf-8") as f:
            for sample in samples:
                f.write(json.dumps(sample) + "\n")

        # https://github.com/wasiahmad/livecodebench/blob/main/livecodebench/evaluate.py#L10
        evaluate(
            custom_output_file=jsonl_file,
            release_version=f"release_{release_version}",
            k_list=[1],
            language=eval_config.language,
            test_file=None if eval_config.language == "python" else eval_config.test_file,
            num_process_evaluate=12,
            timeout=6 if eval_config.language == "python" else 30,
        )

        # Merge the grader's per-sample results back into the jsonl file.
        with open(jsonl_file[:-6] + "_eval_results.json", "rt", encoding="utf-8") as fin:
            eval_grades = json.load(fin)
        with open(jsonl_file, "wt", encoding="utf-8") as f:
            for sample in samples:
                sample["graded_list"] = eval_grades["eval"][sample["task_id"]]["graded_list"]
                f.write(json.dumps(sample) + "\n")

        # moving eval file to ensure metrics are recomputed
        shutil.move(jsonl_file[:-6] + "_eval_results.json", jsonl_file[:-6] + "_eval_results-saved.json")


def eval_livecodebench_pro(cfg):
    """Rewrite generations into the livecodebench-pro submission format.

    Each input ``.jsonl`` file is rewritten in place: ``task_id`` is renamed
    to ``problem_id``, ``completion`` to ``text_response``, and a null
    ``response_meta`` field is added, as expected by the external grader.
    """
    for jsonl_file in unroll_files(cfg.input_files):
        with open(jsonl_file) as f:
            samples = [preprocess_code(json.loads(line), "python") for line in f]
        for sample in samples:
            sample["problem_id"] = sample.pop("task_id")
            sample["text_response"] = sample.pop("completion")
            sample["response_meta"] = None

        with open(jsonl_file, "wt", encoding="utf-8") as f:
            for sample in samples:
                f.write(json.dumps(sample) + "\n")


def eval_evalplus(cfg):
# TODO: need to move it to a separate docker (either our sandbox or separate srun)
from evalplus.evaluate import evaluate
Expand Down Expand Up @@ -228,6 +149,20 @@ def install_requirements(url):
print(f"Error during installation: {e}")


def eval_livecodebench_pro(cfg):
    """Rewrite generations into the livecodebench-pro submission format.

    Each input ``.jsonl`` file is rewritten in place: ``task_id`` becomes
    ``problem_id``, ``completion`` becomes ``text_response``, and a null
    ``response_meta`` field is added for the external grader.
    """
    for path in unroll_files(cfg.input_files):
        # Load and preprocess every record up front, then rewrite the file.
        with open(path) as fin:
            records = [preprocess_code(json.loads(line), "python") for line in fin]

        for record in records:
            record["problem_id"] = record.pop("task_id")
            record["text_response"] = record.pop("completion")
            record["response_meta"] = None

        with open(path, "wt", encoding="utf-8") as fout:
            fout.writelines(json.dumps(record) + "\n" for record in records)


def eval_livebench_coding(cfg):
try:
from livecodebench.evaluate import evaluate
Expand Down
Loading