diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/evaluation/code.md b/docs/evaluation/code.md index 59e7054af3..b7719b9053 100644 --- a/docs/evaluation/code.md +++ b/docs/evaluation/code.md @@ -178,6 +178,88 @@ all you need to do is replace `openhands` with `swe_agent` in the command above. - Benchmark is defined in [`nemo_skills/dataset/livecodebench/__init__.py`](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/dataset/livecodebench/__init__.py) - Original benchmark source is [here](https://github.com/LiveCodeBench/LiveCodeBench). +#### Data Preparation + +First, prepare the dataset by running the `ns prepare_data` command. The arguments below will generate `test_v6_2408_2505.jsonl`. + +``` +ns prepare_data livecodebench --release_version v6 --start_date 2024-08 --end_date 2025-05 +``` + +##### For Pypy3 Evaluation: +If you plan to evaluate using the Pypy3 interpreter, you must add the `--keep_all_columns` flag during data preparation. This will download a larger dataset (~1.9GB) containing the necessary test cases. So, we recommend downloading the dataset into a Slurm cluster location. + +``` +ns prepare_data livecodebench --release_version v6 --start_date 2024-08 --end_date 2025-05 --keep_all_columns --cluster= --data_dir= +``` + +#### Running the Evaluation + +Once the data is prepared, you can run the evaluation. Replace `<...>` placeholders with your cluster and directory paths. + +##### Standard Python Evaluation + +This command runs an evaluation of [OpenReasoning-Nemotron-32B](https://huggingface.co/nvidia/OpenReasoning-Nemotron-32B) on a Slurm cluster. 
+ +``` +ns eval \ + --cluster= \ + --model=nvidia/OpenReasoning-Nemotron-32B \ + --server_type=vllm \ + --server_args="--async-scheduling" \ + --server_nodes=1 \ + --server_gpus=8 \ + --benchmarks=livecodebench \ + --split=test_v6_2408_2505 \ + --data_dir= \ + --output_dir= \ + --extra_eval_args="++eval_config.interpreter=python" \ + --with_sandbox \ + ++inference.temperature=0.6 \ + ++inference.top_p=0.95 \ + ++inference.tokens_to_generate=65536 +``` + +##### Pypy3 Evaluation + +To run with the Pypy3 interpreter, modify the `--extra_eval_args` flag as shown below. +``` +--extra_eval_args="++eval_config.interpreter=pypy3 ++eval_config.test_file=/livecodebench/test_v6_2408_2505.jsonl" +``` + +##### Verifying Results + +After all jobs are complete, you can check the results in `/eval-results/livecodebench/metrics.json`. You can also take a look at `/eval-results/livecodebench/summarized-results/main_*` They should look something like this: + +``` +-------------------------- livecodebench -------------------------- +evaluation_mode | num_entries | avg_tokens | gen_seconds | accuracy +pass@1 | 454 | 15995 | 2188 | 71.15% + + +------------------------ livecodebench-easy ----------------------- +evaluation_mode | num_entries | avg_tokens | gen_seconds | accuracy +pass@1 | 110 | 5338 | 1806 | 99.09% + + +------------------------ livecodebench-hard ----------------------- +evaluation_mode | num_entries | avg_tokens | gen_seconds | accuracy +pass@1 | 203 | 23031 | 2188 | 46.31% + + +----------------------- livecodebench-medium ---------------------- +evaluation_mode | num_entries | avg_tokens | gen_seconds | accuracy +pass@1 | 141 | 14178 | 1889 | 85.11% +``` + +##### Advanced: Averaging Multiple Runs + +Due to variance between runs, you can automatically repeat the evaluation and average the results. 
To run the evaluation 3 times, for example, set the `--benchmarks` flag as follows: + +``` +--benchmarks=livecodebench:3 +``` + ### livecodebench-pro - Benchmark is defined in [`nemo_skills/dataset/livecodebench-pro/__init__.py`](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/dataset/livecodebench-pro/__init__.py) diff --git a/nemo_skills/dataset/livecodebench/prepare.py b/nemo_skills/dataset/livecodebench/prepare.py index 65c1463ee0..40c3210cd9 100644 --- a/nemo_skills/dataset/livecodebench/prepare.py +++ b/nemo_skills/dataset/livecodebench/prepare.py @@ -18,7 +18,7 @@ from datetime import datetime from pathlib import Path -from datasets import load_dataset +from datasets import Value, load_dataset from dateutil.relativedelta import relativedelta @@ -66,7 +66,7 @@ def parse_month_range(start_date, end_date): raise ValueError(str(e)) -def clean_data(dataset): +def clean_data(dataset, keep_all_columns=False): def map_fn(data): question = data["question_content"] + "\n\n" if data["starter_code"]: @@ -80,22 +80,26 @@ def map_fn(data): data["question"] = question.replace(" ", "\t") return data - remove_columns = [ - "question_title", - "contest_id", - "public_test_cases", - "private_test_cases", - "metadata", - "question_content", - "platform", - "question_id", - "starter_code", - ] + remove_columns = [] + if not keep_all_columns: + remove_columns = [ + "question_title", + "contest_id", + "metadata", + "question_content", + "platform", + "question_id", + "starter_code", + "public_test_cases", + "private_test_cases", + ] + dataset = dataset.cast_column("public_test_cases", Value("large_string")) + dataset = dataset.cast_column("private_test_cases", Value("large_string")) dataset = dataset.map(map_fn, remove_columns=remove_columns) return dataset -def prepare(start_date, end_date, release_version, output_dir): +def prepare(start_date, end_date, release_version, output_dir, keep_all_columns=False): start_date, end_date = parse_month_range(start_date, 
end_date) start_yymm = start_date.strftime("%y%m") end_yymm = end_date.strftime("%y%m") @@ -104,7 +108,7 @@ def prepare(start_date, end_date, release_version, output_dir): assert release_version in ["v1", "v2", "v3", "v4", "v5", "v6"] data = parse_data(release_version=f"release_{release_version}") - data = clean_data(data) + data = clean_data(data, keep_all_columns) print("Len of data: ", len(data)) print("Writing to file...") @@ -115,16 +119,10 @@ def prepare(start_date, end_date, release_version, output_dir): for problem in data: input_date = datetime.strptime(problem["contest_date"], "%Y-%m-%dT%H:%M:%S").date() if start_date <= input_date <= end_date: - json.dump( - { - "task_id": problem["task_id"], - "question": problem["question"], - "difficulty": problem["difficulty"], - "subset_for_metrics": problem["difficulty"], - "release_version": release_version, - }, - f, - ) + output_record = {**problem} + output_record["subset_for_metrics"] = problem["difficulty"] + output_record["release_version"] = release_version + json.dump(output_record, f) f.write("\n") @@ -135,7 +133,6 @@ def prepare(start_date, end_date, release_version, output_dir): ("v6", "2024-08", "2025-05"), # current default in lb ] - if __name__ == "__main__": # Write an argparse to a json file, read it in and parse it parser = argparse.ArgumentParser() @@ -143,6 +140,7 @@ def prepare(start_date, end_date, release_version, output_dir): parser.add_argument("--release_version", type=str, default="all") parser.add_argument("--start_date", type=str, default="all", help="End date in YYYY-MM format") parser.add_argument("--end_date", type=str, default="all", help="End date in YYYY-MM format") + parser.add_argument("--keep_all_columns", action="store_true", help="keep all columns in the output jsonl file") args = parser.parse_args() @@ -150,14 +148,14 @@ def prepare(start_date, end_date, release_version, output_dir): # Prepare all splits for release_version, start_date, end_date in DEFAULT_SPLITS: 
print(f"Processing data for {release_version} from {start_date} to {end_date}") - prepare(start_date, end_date, release_version, args.output_dir) + prepare(start_date, end_date, release_version, args.output_dir, args.keep_all_columns) else: if args.release_version == "all" or args.start_date == "all" or args.end_date == "all": raise ValueError( "If preparing a custom split, you must specify all " "--release_version, --start_date, and --end_date arguments." ) - prepare(args.start_date, args.end_date, args.release_version, args.output_dir) + prepare(args.start_date, args.end_date, args.release_version, args.output_dir, args.keep_all_columns) # test_v5_2408_2502.jsonl: 279 samples # test_v5_2410_2502.jsonl: 166 samples diff --git a/nemo_skills/evaluation/evaluator/__init__.py b/nemo_skills/evaluation/evaluator/__init__.py index 7bdbfda6ea..70b79fc924 100644 --- a/nemo_skills/evaluation/evaluator/__init__.py +++ b/nemo_skills/evaluation/evaluator/__init__.py @@ -21,12 +21,12 @@ eval_bigcodebench, eval_evalplus, eval_livebench_coding, - eval_livecodebench, eval_livecodebench_pro, ) from nemo_skills.evaluation.evaluator.ifbench import eval_ifbench from nemo_skills.evaluation.evaluator.ifeval import eval_if from nemo_skills.evaluation.evaluator.ioi import eval_ioi +from nemo_skills.evaluation.evaluator.livecodebench import eval_livecodebench from nemo_skills.evaluation.evaluator.math import ( Lean4ProofEvaluator, Lean4StatementEvaluator, diff --git a/nemo_skills/evaluation/evaluator/code.py b/nemo_skills/evaluation/evaluator/code.py index 0c748cf56f..c9aca7bb98 100644 --- a/nemo_skills/evaluation/evaluator/code.py +++ b/nemo_skills/evaluation/evaluator/code.py @@ -23,7 +23,8 @@ from omegaconf import OmegaConf -from nemo_skills.utils import get_logger_name, nested_dataclass, unroll_files +from nemo_skills.file_utils import unroll_files +from nemo_skills.utils import get_logger_name LOG = logging.getLogger(get_logger_name(__file__)) @@ -99,86 +100,6 @@ def 
install_from_git(git_url): print(f"Error during installation: {e}") -# TODO: use sandbox -@nested_dataclass(kw_only=True) -class LiveCodeBenchEvaluatorConfig: - language: str = "python" # "cpp" is another option now - test_file: str = None - - -def eval_livecodebench(cfg): - try: - from livecodebench.evaluate import evaluate - except ImportError: - LOG.info("Package 'livecodebench' not found. Attempting to install...") - # install_from_git("git+https://github.com/wasiahmad/livecodebench.git") - install_from_git("git+https://github.com/wasiahmad/livecodebench.git@f285640c20aaf18df1ee5917621a596af4630b5e") - try: - from livecodebench.evaluate import evaluate - except ImportError: - LOG.info("Failed to install 'livecodebench'. Please install it manually.") - raise - - eval_config = LiveCodeBenchEvaluatorConfig(_init_nested=True, **cfg.eval_config) - assert eval_config.language in ["python", "cpp"] - if eval_config.language == "cpp": - assert eval_config.test_file is not None - - release_version = None - for jsonl_file in unroll_files(cfg.input_files): - with open(jsonl_file) as f: - samples = [preprocess_code(json.loads(line), eval_config.language) for line in f] - for sample in samples: - sample["question_id"] = sample["task_id"] - sample["code_list"] = [sample["completion"]] - if release_version is None: - release_version = sample["release_version"] - if release_version != sample["release_version"]: - raise ValueError( - f"All samples should have the same release version, " - f"but got {release_version} and {sample['release_version']}" - ) - - with open(jsonl_file, "wt", encoding="utf-8") as f: - for sample in samples: - f.write(json.dumps(sample) + "\n") - - # https://github.com/wasiahmad/livecodebench/blob/main/livecodebench/evaluate.py#L10 - evaluate( - custom_output_file=jsonl_file, - release_version=f"release_{release_version}", - k_list=[1], - language=eval_config.language, - test_file=None if eval_config.language == "python" else eval_config.test_file, - 
def eval_livecodebench_pro(cfg):
    """Convert generation outputs into the LiveCodeBench-Pro grader format.

    For every input ``.jsonl`` file, rewrites each sample in place so that:
      - ``task_id`` is renamed to ``problem_id``,
      - ``completion`` (the code extracted by ``preprocess_code``) is renamed
        to ``text_response``,
      - ``response_meta`` is added as ``None`` (field expected by the grader).

    Args:
        cfg: evaluation config; only ``cfg.input_files`` (glob patterns of
            generation output files) is used.
    """
    for jsonl_file in unroll_files(cfg.input_files):
        # read with explicit utf-8 to match the encoding used when writing back;
        # relying on the platform default encoding can corrupt non-ASCII content
        with open(jsonl_file, "rt", encoding="utf-8") as f:
            samples = [preprocess_code(json.loads(line), "python") for line in f]
        for sample in samples:
            sample["problem_id"] = sample.pop("task_id")
            sample["text_response"] = sample.pop("completion")
            sample["response_meta"] = None

        # overwrite the input file with the converted records
        with open(jsonl_file, "wt", encoding="utf-8") as f:
            for sample in samples:
                f.write(json.dumps(sample) + "\n")
import asyncio
import json
import logging
import shlex
import shutil
import textwrap
from contextlib import asynccontextmanager
from dataclasses import field

from nemo_skills.code_execution.sandbox import get_sandbox
from nemo_skills.evaluation.evaluator.code import preprocess_code
from nemo_skills.utils import get_logger_name, nested_dataclass, unroll_files

LOG = logging.getLogger(get_logger_name(__file__))

# Pinned forks of the livecodebench evaluation harness. The "livecodebench"
# branch is used with CPython; the default branch is used when running the
# generated solutions under pypy3.
LIVECODEBENCH_PYTHON_GIT_URL = "git+https://github.com/wasiahmad/livecodebench.git@livecodebench"
LIVECODEBENCH_PYPY3_GIT_URL = "git+https://github.com/wasiahmad/livecodebench.git"


@nested_dataclass(kw_only=True)
class LiveCodeBenchEvaluatorConfig:
    # sandbox used to install and run the livecodebench harness
    sandbox: dict = field(default_factory=lambda: {"sandbox_type": "local"})
    language: str = "python"  # "cpp" is another option now
    # required for cpp; for pypy3 runs this points at the jsonl with test cases
    test_file: str = None
    interpreter: str = "python"  # use either "python" or "pypy3"
    timeout: int = 6  # per-sample execution timeout (seconds)
    num_processes: int = 12  # parallelism inside the harness


@asynccontextmanager
async def sandbox_context(config: dict):
    """Async context manager yielding a sandbox that is always closed on exit."""
    sandbox = get_sandbox(**config)
    try:
        yield sandbox
    finally:
        LOG.info("Closing sandbox...")
        await sandbox.close()


async def install_packages(eval_config: LiveCodeBenchEvaluatorConfig) -> bool:
    """Install the livecodebench harness in a temporary sandbox.

    Picks the pip command and git URL matching the configured interpreter.
    Returns True on success, False on failure.
    """
    async with sandbox_context(eval_config.sandbox) as sandbox:
        LOG.info("Installing livecodebench with %s...", eval_config.interpreter)
        if eval_config.interpreter == "python":
            pip_cmd, git_url = "pip", LIVECODEBENCH_PYTHON_GIT_URL
        else:
            pip_cmd, git_url = "pypy3 -m pip", LIVECODEBENCH_PYPY3_GIT_URL
        cmd = f"{pip_cmd} install {git_url}"

        result, _ = await sandbox.execute_code(cmd, language="shell", timeout=300)
        if result.get("process_status") != "completed":
            LOG.warning("Failed to install livecodebench: %s", result.get("stderr", "Unknown error"))
            return False

        LOG.info("Successfully installed livecodebench.")
        return True


async def eval_livecodebench_async(cfg):
    """Run LiveCodeBench grading for every generation file in ``cfg.input_files``.

    For each ``.jsonl`` file: wraps the extracted code into the harness format
    (``code_list``), invokes ``livecodebench.evaluate`` inside the sandbox with
    the configured interpreter, merges the per-task ``graded_list`` back into
    the file, and renames the harness results file so metrics are recomputed
    on reruns.

    Raises:
        ValueError: on an invalid language/interpreter combination, or when
            samples in one file mix release versions.
        RuntimeError: when the harness cannot be installed in the sandbox.
    """
    eval_config = LiveCodeBenchEvaluatorConfig(_init_nested=True, **cfg.eval_config)

    # validate configuration up front so a typo fails loudly, not mid-run
    if eval_config.language not in ("python", "cpp"):
        raise ValueError("language must be 'python' or 'cpp'.")
    if eval_config.language == "python" and eval_config.interpreter not in ("python", "pypy3"):
        raise ValueError("Python interpreter must be 'python' or 'pypy3'.")
    if eval_config.language == "cpp" and eval_config.test_file is None:
        raise ValueError("C++ evaluation requires a test_file.")

    if not await install_packages(eval_config):
        # fail loudly: silently skipping evaluation would surface later only as
        # missing metrics with no indication of the root cause
        raise RuntimeError("livecodebench installation failed, see logs above.")

    async with sandbox_context(eval_config.sandbox) as sandbox:
        for jsonl_file in unroll_files(cfg.input_files):
            LOG.info("Processing file: %s", jsonl_file)

            with open(jsonl_file, encoding="utf-8") as f_in:
                samples = [preprocess_code(json.loads(line), eval_config.language) for line in f_in]

            # the harness is invoked once per file with a single release
            # version, so mixed versions in one file cannot be graded correctly
            versions = {s["release_version"] for s in samples}
            if len(versions) > 1:
                raise ValueError(f"All samples should have the same release version. Found: {versions}")
            release_version = versions.pop()

            for s in samples:
                s["code_list"] = [s["completion"]]

            with open(jsonl_file, "w", encoding="utf-8") as f_out:
                f_out.writelines(json.dumps(sample) + "\n" for sample in samples)

            # !r produces valid Python literals even when paths contain quotes
            # or backslashes; bare '{...}' interpolation would generate broken
            # code for such paths. None renders as None, so test_file needs no
            # special-casing.
            eval_code = textwrap.dedent(f"""
                from livecodebench.evaluate import evaluate
                evaluate(
                    custom_output_file={jsonl_file!r},
                    release_version={f"release_{release_version}"!r},
                    test_file={eval_config.test_file!r},
                    k_list=[1],
                    language={eval_config.language!r},
                    num_process_evaluate={eval_config.num_processes},
                    timeout={eval_config.timeout},
                )
            """)

            cmd = f"{eval_config.interpreter} -c {shlex.quote(eval_code)}"
            output, _ = await sandbox.execute_code(
                cmd,
                language="shell",
                # generous budget: per-sample timeout plus startup slack
                timeout=eval_config.timeout * len(samples) + 60,
                max_output_characters=100_000,
            )

            if output.get("process_status") != "completed":
                LOG.error("Evaluation failed for %s. Stderr: %s", jsonl_file, output.get("stderr"))
                continue

            results_file = jsonl_file[:-6] + "_eval_results.json"
            with open(results_file, "rt", encoding="utf-8") as fin:
                eval_grades = json.load(fin)

            with open(jsonl_file, "wt", encoding="utf-8") as f_out:
                for s in samples:
                    s["graded_list"] = eval_grades["eval"][s["task_id"]]["graded_list"]
                    f_out.write(json.dumps(s) + "\n")

            # move the harness results aside so a rerun recomputes metrics
            shutil.move(results_file, jsonl_file[:-6] + "_eval_results-saved.json")
            LOG.info("Finished processing %s, results saved.", jsonl_file)


def eval_livecodebench(cfg):
    """Synchronous wrapper to run the async evaluation."""
    asyncio.run(eval_livecodebench_async(cfg))