diff --git a/docs/evaluation/code.md b/docs/evaluation/code.md index eda939427d..596b133b75 100644 --- a/docs/evaluation/code.md +++ b/docs/evaluation/code.md @@ -196,4 +196,9 @@ all you need to do is replace `openhands` with `swe_agent` in the command above. ### bigcodebench - Benchmark is defined in [`nemo_skills/dataset/bigcodebench/__init__.py`](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/dataset/bigcodebench/__init__.py) -- Original benchmark source is [here](https://github.com/bigcode-project/bigcodebench). \ No newline at end of file +- Original benchmark source is [here](https://github.com/bigcode-project/bigcodebench). + +### livebench-coding + +- Benchmark is defined in [`nemo_skills/dataset/livebench-coding/__init__.py`](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/dataset/livebench-coding/__init__.py) +- Original benchmark source is [here](https://huggingface.co/datasets/livebench/coding). \ No newline at end of file diff --git a/nemo_skills/dataset/livebench-coding/__init__.py b/nemo_skills/dataset/livebench-coding/__init__.py new file mode 100644 index 0000000000..fc8fd66822 --- /dev/null +++ b/nemo_skills/dataset/livebench-coding/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# settings that define how evaluation should be done by default (all can be changed from cmdline) +DATASET_GROUP = "code" +METRICS_TYPE = "livebench_coding" +EVAL_SPLIT = "test" +EVAL_ARGS = "++eval_type=livebench_coding" +GENERATION_ARGS = "++prompt_config=generic/default" diff --git a/nemo_skills/dataset/livebench-coding/prepare.py b/nemo_skills/dataset/livebench-coding/prepare.py new file mode 100644 index 0000000000..14deee2cf7 --- /dev/null +++ b/nemo_skills/dataset/livebench-coding/prepare.py @@ -0,0 +1,73 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import json +import os +from pathlib import Path + +from datasets import load_dataset + + +def parse_data(): + data = load_dataset("livebench/coding", split="test", trust_remote_code=True) + # Dataset({ + # features: ['question_id', 'category', 'turns', 'question_title', 'public_test_cases', 'private_test_cases', 'original_json', 'release_date', 'citation', 'task', 'livebench_release_date', 'livebench_removal_date', 'remainder', 'solution', 'partial_solution'], + # num_rows: 128 + # }) + return data + + +def clean_data(dataset): + def map_fn(data): + question = data["turns"][0] + data["question"] = question.replace(" ", "\t") + return data + + remove_columns = [ + "category", + "turns", + "question_title", + "public_test_cases", + "private_test_cases", + "original_json", + "release_date", + "citation", + "livebench_release_date", + "livebench_removal_date", + "remainder", + "solution", + ] + dataset = dataset.map(map_fn, remove_columns=remove_columns) + return dataset + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--output_dir", type=str, default=str(Path(__file__).parent)) + args = parser.parse_args() + + data = parse_data() + data = clean_data(data) + print("Len of data: ", len(data)) + + print("Writing to file...") + if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir) + + output_file_path = os.path.join(args.output_dir, "test.jsonl") + with open(output_file_path, "w") as f: + for problem in data: + json.dump(problem, f) + f.write("\n") diff --git a/nemo_skills/evaluation/evaluator/__init__.py b/nemo_skills/evaluation/evaluator/__init__.py index 9feacea647..44b76a7928 100644 --- a/nemo_skills/evaluation/evaluator/__init__.py +++ b/nemo_skills/evaluation/evaluator/__init__.py @@ -18,6 +18,7 @@ from nemo_skills.evaluation.evaluator.code import ( eval_bigcodebench, eval_evalplus, + eval_livebench_coding, eval_livecodebench, eval_livecodebench_pro, ) @@ -51,6 +52,7 @@ def 
dummy_eval(cfg): "multichoice": eval_mcq, "ruler": eval_ruler, "livecodebench": eval_livecodebench, + "livebench_coding": eval_livebench_coding, "livecodebench_pro": eval_livecodebench_pro, "scicode": eval_scicode, "mrcr": eval_mrcr, diff --git a/nemo_skills/evaluation/evaluator/code.py b/nemo_skills/evaluation/evaluator/code.py index 184fe177ca..0c748cf56f 100644 --- a/nemo_skills/evaluation/evaluator/code.py +++ b/nemo_skills/evaluation/evaluator/code.py @@ -32,9 +32,10 @@ ) -def preprocess_code(generation_dict: dict, language="python"): +def preprocess_code(generation_dict: dict, language="python", strip_whitespace=True): completion = generation_dict["generation"] - completion = completion.strip() + if strip_whitespace: + completion = completion.strip() completion = completion.replace("\r", "") ##### To handle code generation by reasoning models @@ -57,25 +58,33 @@ def preprocess_code(generation_dict: dict, language="python"): if start_with_lang_tag in completion: def_line = completion.index(start_with_lang_tag) + len(start_with_lang_tag) - completion = completion[def_line:].strip() + completion = completion[def_line:] + if strip_whitespace: + completion = completion.strip() try: next_line = completion.index(generic_start_end_tag) - completion = completion[:next_line].strip() + completion = completion[:next_line] + if strip_whitespace: + completion = completion.strip() except Exception: print(completion) print("================\n") elif generic_start_end_tag in completion: def_line = completion.index(generic_start_end_tag) + len(generic_start_end_tag) - completion = completion[def_line:].strip() + completion = completion[def_line:] + if strip_whitespace: + completion = completion.strip() try: next_line = completion.index(generic_start_end_tag) - completion = completion[:next_line].strip() + completion = completion[:next_line] + if strip_whitespace: + completion = completion.strip() except Exception: print(completion) print("================\n") - if 
completion.startswith(" "): + if completion.startswith(" ") and strip_whitespace: completion = completion.strip() generation_dict["completion"] = completion @@ -219,6 +228,57 @@ def install_requirements(url): print(f"Error during installation: {e}") +def eval_livebench_coding(cfg): + try: + from livecodebench.evaluate import evaluate + except ImportError: + LOG.info("Package 'livecodebench' not found. Attempting to install...") + install_from_git("git+https://github.com/wasiahmad/livecodebench.git@livebench") + try: + from livecodebench.evaluate import evaluate + except ImportError: + LOG.info("Failed to install 'livecodebench'. Please install it manually.") + raise + + for jsonl_file in unroll_files(cfg.input_files): + samples = [] + with open(jsonl_file) as f: + for line in f: + sample = json.loads(line) + if sample["task"] == "coding_completion": + assert len(sample["partial_solution"]) > 0 + sample = preprocess_code(sample, strip_whitespace=False) + sample["completion"] = sample["completion"].replace("\t", " ") + full_solution = sample["partial_solution"] + "\n" + sample["completion"] + sample["code_list"] = [full_solution] + else: + sample = preprocess_code(sample, strip_whitespace=True) + sample["code_list"] = [sample["completion"]] + + samples.append(sample) + + with open(jsonl_file, "wt", encoding="utf-8") as f: + for sample in samples: + f.write(json.dumps(sample) + "\n") + + evaluate( + custom_output_file=jsonl_file, + k_list=[1], + num_process_evaluate=12, + timeout=6, + ) + + with open(jsonl_file[:-6] + "_eval_results.json", "rt", encoding="utf-8") as fin: + eval_grades = json.load(fin) + with open(jsonl_file, "wt", encoding="utf-8") as f: + for sample in samples: + sample["graded_list"] = eval_grades["eval"][sample["question_id"]]["graded_list"] + f.write(json.dumps(sample) + "\n") + + # moving eval file to ensure metrics are recomputed + shutil.move(jsonl_file[:-6] + "_eval_results.json", jsonl_file[:-6] + "_eval_results-saved.json") + + def 
install_or_upgrade_package(package_name): try: # Run the pip command to install or upgrade the package diff --git a/nemo_skills/evaluation/metrics/map_metrics.py b/nemo_skills/evaluation/metrics/map_metrics.py index 401bc30e6c..7c08b75c56 100644 --- a/nemo_skills/evaluation/metrics/map_metrics.py +++ b/nemo_skills/evaluation/metrics/map_metrics.py @@ -48,6 +48,7 @@ "scicode": SciCodeMetrics, "bigcodebench": BigCodeBenchMetrics, "mrcr": MRCRMetrics, + "livebench_coding": LiveCodeBenchMetrics, }