diff --git a/docs/evaluation/code.md b/docs/evaluation/code.md
index 1b4a60a14e..68decc1645 100644
--- a/docs/evaluation/code.md
+++ b/docs/evaluation/code.md
@@ -328,6 +328,38 @@ Due to variance between runs, you can automatically repeat the evaluation and av
 - Benchmark is defined in [`nemo_skills/dataset/livecodebench-pro/__init__.py`](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/livecodebench-pro/__init__.py)
 - Original benchmark source is [here](https://github.com/GavinZhengOI/LiveCodeBench-Pro).
 
+#### Data Preparation
+
+First, prepare the dataset by running the `ns prepare_data` command. The command below generates the `test_24q4.jsonl`, `test_25q1.jsonl`, `test_25q2.jsonl`, and `test_25q3.jsonl` files.
+
+```
+ns prepare_data livecodebench-pro --cluster=local --data_dir=/workspace/ns-data
+```
+
+Note that this also downloads the testcases and stores them at `/workspace/ns-data/livecodebench-pro/testcases`. We recommend using a cluster data location since the testcases directory is about 15GB. The prepare script requires a Hugging Face token, so set the `HF_TOKEN` environment variable before running.
+
+#### Running the Evaluation
+
+```
+ns eval \
+    --cluster=<cluster_config> \
+    --model=nvidia/OpenReasoning-Nemotron-32B \
+    --server_type=vllm \
+    --server_args="--async-scheduling" \
+    --server_nodes=1 \
+    --server_gpus=8 \
+    --benchmarks=livecodebench-pro \
+    --split=test_25q2 \
+    --data_dir=/workspace/ns-data/livecodebench-pro \
+    --output_dir=<output_dir> \
+    ++parse_reasoning=True \
+    ++eval_config.test_file=/workspace/ns-data/livecodebench-pro/test_25q2.jsonl \
+    ++eval_config.test_dir=/workspace/ns-data/livecodebench-pro/testcases \
+    ++inference.temperature=0.6 \
+    ++inference.top_p=0.95 \
+    ++inference.tokens_to_generate=65536
+```
+
 ### human-eval
 
 - Benchmark is defined in [`nemo_skills/dataset/human-eval/__init__.py`](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/human-eval/__init__.py)
diff --git a/nemo_skills/dataset/livecodebench-pro/__init__.py b/nemo_skills/dataset/livecodebench-pro/__init__.py
index a071794767..7aa6e5f0ed 100644
--- a/nemo_skills/dataset/livecodebench-pro/__init__.py
+++ b/nemo_skills/dataset/livecodebench-pro/__init__.py
@@ -14,5 +14,6 @@
 
 # settings that define how evaluation should be done by default (all can be changed from cmdline)
 DATASET_GROUP = "code"
-METRICS_TYPE = "code"
-GENERATION_ARGS = "++prompt_config=eval/livecodebench/python_codegen ++eval_type=livecodebench_pro"
+METRICS_TYPE = "livecodebench_pro"
+EVAL_SPLIT = "test_25q2"
+GENERATION_ARGS = "++prompt_config=eval/livecodebench/cpp_codegen ++eval_type=livecodebench_pro"
diff --git a/nemo_skills/dataset/livecodebench-pro/prepare.py b/nemo_skills/dataset/livecodebench-pro/prepare.py
index 0f648d0a2e..52703532de 100644
--- a/nemo_skills/dataset/livecodebench-pro/prepare.py
+++ b/nemo_skills/dataset/livecodebench-pro/prepare.py
@@ -13,19 +13,71 @@
 # limitations under the License.
 
 import json
+import os
 from pathlib import Path
 
 from datasets import load_dataset
+from huggingface_hub import snapshot_download
+
+TESTCASE_REPO = "QAQAQAQAQ/LiveCodeBench-Pro-Testcase"
+PROBLEM_REPO = "QAQAQAQAQ/LiveCodeBench-Pro"
+DEFAULT_SPLITS = [
+    ("24q4", "quater_2024_10_12", 207),
+    ("25q1", "quater_2025_1_3", 166),
+    ("25q2", "quater_2025_4_6", 167),
+    ("25q3", "quater_2025_7_9", 144),
+]
+
+
+def download_testcases(local_dir, token):
+    """
+    Downloads the large testcase dataset (~15GB) to the specified directory.
+ """ + print(f"Downloading testcases from {TESTCASE_REPO} to {local_dir}...") + try: + path = snapshot_download(repo_id=TESTCASE_REPO, repo_type="dataset", local_dir=local_dir, token=token) + print(f"Testcases successfully downloaded to: {path}") + except Exception as e: + print(f"Failed to download testcases: {e}") + raise + + +def process_problem_splits(output_dir, token): + """ + Downloads problem descriptions, converts them to JSONL, and saves them. + """ + print(f"Processing problem splits from {PROBLEM_REPO}...") + + for tag, split, sample_size in DEFAULT_SPLITS: + print(f" - Processing split: {split} -> test_{tag}.jsonl") + + try: + dataset = load_dataset(PROBLEM_REPO, split=split, token=token) + if len(dataset) != sample_size: + print(f" WARNING: Expected {sample_size} samples for {split}, but got {len(dataset)}.") + + output_file = output_dir / f"test_{tag}.jsonl" + + with open(output_file, "w", encoding="utf-8") as f: + for row in dataset: + output_record = dict(row) + output_record["question"] = row["problem_statement"] + output_record["subset_for_metrics"] = row["difficulty"] + + f.write(json.dumps(output_record) + "\n") + + except Exception as e: + print(f" Error processing split {split}: {e}") + if __name__ == "__main__": + hf_token = os.environ.get("HF_TOKEN") + if not hf_token: + print("Error: HF_TOKEN environment variable is required.") + print("Please export it: export HF_TOKEN='hf_...'") + exit(1) + data_dir = Path(__file__).absolute().parent - output_file = str(data_dir / "test.jsonl") - - dataset = load_dataset("anonymous1926/anonymous_dataset") - with open(output_file, "w") as f: - for split_name, split in dataset.items(): - for row in split: - row["task_id"] = row.pop("problem_id") - row["question"] = row.pop("problem_statement") - row["split"] = split_name - f.write(json.dumps(row) + "\n") + testcase_dir = data_dir / "testcases" + download_testcases(local_dir=testcase_dir, token=hf_token) + process_problem_splits(output_dir=data_dir, token=hf_token) diff --git a/nemo_skills/evaluation/evaluator/code.py b/nemo_skills/evaluation/evaluator/code.py index 3800495d28..f24bae422d 100644 --- a/nemo_skills/evaluation/evaluator/code.py +++ b/nemo_skills/evaluation/evaluator/code.py @@ -115,14 +115,14 @@ async def eval_full(self): # type: ignore[override] LOG.info("Full evaluation completed successfully") -def preprocess_code(generation_dict: dict, language="python", strip_whitespace=True): - completion = generation_dict.get("generation", "") or "" +def preprocess_code(generation_dict: dict, language: str = "python", strip_whitespace: bool = True): + completion = generation_dict.get("generation", "") completion = completion.replace("\r", "") # --------------------------------------------------------- # 1. Handle reasoning traces: ... 
     # ---------------------------------------------------------
-    if "</think>" in completion:
+    if "</think>" in completion:  # partition is faster than regex and avoids imports
         _, separator, post_thought = completion.partition("</think>")
         if separator:
             completion = post_thought
@@ -194,7 +194,7 @@ def eval_evalplus(cfg):
     jsonl_file = cfg.input_file
 
     with open(jsonl_file) as f:
-        samples = [preprocess_code(json.loads(line)) for line in f]
+        samples = [preprocess_code(json.loads(line), language="python") for line in f]
     # all changes will be done with a new key "completion", so it's ok to write to the same file
     with open(jsonl_file, "wt", encoding="utf-8") as f:
         for sample in samples:
@@ -236,20 +236,63 @@ def install_requirements(url):
         print(f"Error during installation: {e}")
 
 
+@nested_dataclass(kw_only=True)
+class LiveCodeBenchProEvaluatorConfig(BaseEvaluatorConfig):
+    sandbox: dict = field(default_factory=lambda: {"sandbox_type": "local"})
+    language: str = "cpp"  # use either "python" or "cpp"
+    test_file: str = None
+    test_dir: str = None  # path to the unit tests directory
+    timeout: int = 6
+    num_processes: int = 12
+
+
 def eval_livecodebench_pro(cfg):
-    cfg = BaseEvaluatorConfig(**cfg)
+    cfg = LiveCodeBenchProEvaluatorConfig(**cfg)
+    try:
+        from livecodebench.evaluate import evaluate
+    except ImportError:
+        LOG.info("Package 'livecodebench' not found. Attempting to install...")
+        install_from_git("git+https://github.com/wasiahmad/livecodebench.git@livecodebench_pro")
+        try:
+            from livecodebench.evaluate import evaluate
+        except ImportError:
+            LOG.info("Failed to install 'livecodebench'. Please install it manually.")
+            raise
+
 
     jsonl_file = cfg.input_file
+    samples = []
     with open(jsonl_file) as f:
-        samples = [preprocess_code(json.loads(line), "python") for line in f]
-    for sample in samples:
-        sample["problem_id"] = sample.pop("task_id")
-        sample["text_response"] = sample.pop("completion")
-        sample["response_meta"] = None
+        for line in f:
+            sample = json.loads(line)
+            sample = preprocess_code(sample, language=cfg.language, strip_whitespace=True)
+            sample["code_list"] = [sample["completion"]]
+            samples.append(sample)
     with open(jsonl_file, "wt", encoding="utf-8") as f:
         for sample in samples:
             f.write(json.dumps(sample) + "\n")
 
+    evaluate(
+        custom_output_file=jsonl_file,
+        language=cfg.language,
+        test_file=cfg.test_file,
+        test_dir=cfg.test_dir,
+        k_list=[1],
+        num_process_evaluate=cfg.num_processes,
+        timeout=cfg.timeout,
+    )
+
+    with open(jsonl_file[:-6] + "_eval_results.json", "rt", encoding="utf-8") as fin:
+        eval_grades = json.load(fin)
+    with open(jsonl_file, "wt", encoding="utf-8") as f:
+        for sample in samples:
+            if sample["problem_id"] in eval_grades["eval"]:
+                sample["graded_list"] = eval_grades["eval"][sample["problem_id"]]["graded_list"]
+            f.write(json.dumps(sample) + "\n")
+
+    # moving eval file to ensure metrics are recomputed
+    shutil.move(jsonl_file[:-6] + "_eval_results.json", jsonl_file[:-6] + "_eval_results-saved.json")
+
 
 def eval_livebench_coding(cfg):
     cfg = BaseEvaluatorConfig(**cfg)
@@ -271,12 +314,12 @@ def eval_livebench_coding(cfg):
             sample = json.loads(line)
             if sample["task"] == "coding_completion":
                 assert len(sample["partial_solution"]) > 0
-                sample = preprocess_code(sample, strip_whitespace=False)
+                sample = preprocess_code(sample, language="python", strip_whitespace=False)
                 sample["completion"] = sample["completion"].replace("\t", "    ")
                 full_solution = sample["partial_solution"] + "\n" + sample["completion"]
                 sample["code_list"] = [full_solution]
             else:
-                sample = preprocess_code(sample, strip_whitespace=True)
+                sample = preprocess_code(sample, language="python", strip_whitespace=True)
                 sample["code_list"] = [sample["completion"]]
             samples.append(sample)
 
@@ -332,7 +375,7 @@ def eval_bigcodebench(cfg):
     samples = []
     with open(jsonl_file) as f:
         for line in f:
-            generation_dict = preprocess_code(json.loads(line))
+            generation_dict = preprocess_code(json.loads(line), language="python")
             generation_dict["solution"] = generation_dict.pop("completion")
             samples.append(generation_dict)
     with open(jsonl_file, "wt", encoding="utf-8") as f:
@@ -417,7 +460,7 @@ def postprocess_code(sample):
         elif data_split != sample["split"]:
             raise ValueError(f"All samples should have the same split, but got {data_split} and {sample['split']}")
 
-        sample = preprocess_code(sample, strip_whitespace=False)
+        sample = preprocess_code(sample, language="python", strip_whitespace=False)
         sample["original_completion"] = sample["completion"]
         sample = postprocess_code(sample)
         samples.append(sample)
diff --git a/nemo_skills/evaluation/metrics/map_metrics.py b/nemo_skills/evaluation/metrics/map_metrics.py
index 09842b3220..94f8a4bba4 100644
--- a/nemo_skills/evaluation/metrics/map_metrics.py
+++ b/nemo_skills/evaluation/metrics/map_metrics.py
@@ -57,6 +57,7 @@
     "multichoice": MathMetrics,
    "ruler": RulerMetrics,
     "livecodebench": LiveCodeBenchMetrics,
+    "livecodebench_pro": LiveCodeBenchMetrics,
     "swe-bench": SweBenchMetrics,
     "scicode": SciCodeMetrics,
     "bigcodebench": BigCodeBenchMetrics,
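
A quick way to sanity-check the prepared splits before launching `ns eval` is sketched below. It is not part of the patch: the data path mirrors the `--data_dir` from the docs example above, the `question` and `subset_for_metrics` fields come from `prepare.py`, and the presence of any other columns (such as `problem_id`, which `dict(row)` would carry over from the upstream dataset) is an assumption.

```python
# Hypothetical sanity check for the prepared LiveCodeBench-Pro splits.
# The path below assumes the --data_dir used in the docs example.
import json
from collections import Counter
from pathlib import Path

data_dir = Path("/workspace/ns-data/livecodebench-pro")

for split_file in sorted(data_dir.glob("test_*.jsonl")):
    records = [json.loads(line) for line in split_file.open()]
    # prepare.py writes "question" and "subset_for_metrics" into every record
    missing = [r for r in records if "question" not in r or "subset_for_metrics" not in r]
    difficulties = Counter(r.get("subset_for_metrics") for r in records)
    print(f"{split_file.name}: {len(records)} problems, {len(missing)} malformed, difficulties={dict(difficulties)}")
```

Comparing the printed counts against the expected sizes in `DEFAULT_SPLITS` (207/166/167/144) gives a fast check that data preparation completed before spending GPU time on generation.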
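For context on the `preprocess_code` changes: the evaluator strips the reasoning trace by partitioning on `</think>` and then extracts the code from whatever follows. The snippet below is a self-contained sketch of that idea only, not the actual `nemo_skills` implementation (which also handles whitespace rules and multiple languages); the regex-based fence extraction and the helper name are illustrative.

```python
# Minimal standalone sketch of the reasoning-trace stripping used in preprocess_code;
# the real function in nemo_skills does more (language-aware extraction, whitespace handling).
import re


def strip_reasoning_and_extract(completion: str, language: str = "cpp") -> str:
    # drop everything up to and including the first closing </think> tag
    _, separator, post_thought = completion.partition("</think>")
    if separator:
        completion = post_thought
    # keep the last fenced code block if one is present, otherwise return the raw text
    blocks = re.findall(rf"```(?:{language})?\n(.*?)```", completion, flags=re.DOTALL)
    return blocks[-1].strip() if blocks else completion.strip()


example = "<think>reason about the problem</think>Here is my solution:\n```cpp\nint main() { return 0; }\n```"
print(strip_reasoning_and_extract(example))  # -> int main() { return 0; }
```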