diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 3c6cd2e0c7..0908e9d58e 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -85,10 +85,7 @@ jobs: NVIDIA_API_KEY: ${{ secrets.NVIDIA_API_KEY }} HF_TOKEN: ${{ secrets.HF_TOKEN }} run: | - # Default shared runtime directory - sudo mkdir -p /nemo_run - sudo chmod 777 /nemo_run - docker run --rm --network=host -v /nemo_run:/nemo_run nemo-skills-sandbox-image & + docker run --rm --network=host nemo-skills-sandbox-image & sleep 10 set -o pipefail # this will make sure next line returns non-0 exit code if tests fail ns prepare_data gsm8k math-500 diff --git a/docs/evaluation/code.md b/docs/evaluation/code.md index 1b4a60a14e..5a8d634bcc 100644 --- a/docs/evaluation/code.md +++ b/docs/evaluation/code.md @@ -185,10 +185,10 @@ We currently support IOI24 and are working to support IOI25 for evaluation. The #### Data Preparation -First, prepare the dataset by running the `ns prepare_data` command. The arguments below will generate `ioi24.jsonl` and `ioi24_metadata.json`. +First, prepare the dataset by running the `ns prepare_data` command. The arguments below will generate `test.jsonl` and `test_metadata.json`. ``` -ns prepare_data ioi +ns prepare_data ioi24 ``` #### Running the Evaluation @@ -209,11 +209,10 @@ ns eval \ --server_gpus=8 \ --benchmarks=ioi24:50 \ --with_sandbox \ - --split=ioi24 \ + --split=test \ --data_dir= \ --output_dir= \ - --eval_subfolder=eval-results/ioi24/ \ # set the folder if you want to differentiate subsets. - --extra_eval_args="++eval_config.test_file=/ioi24_metadata.json" \ + --extra_eval_args="++eval_config.test_file=" \ ++inference.temperature=0.6 \ ++inference.top_p=0.95 \ ++inference.tokens_to_generate=65536 @@ -221,12 +220,13 @@ ns eval \ ##### Verifying Results -After all jobs are complete, you can check the results in `/eval-results/ioi24/ioi/metrics.json`. You can also take a look at `/eval-results/ioi24/ioi/summarized-results/main_*`. They should look something like this: +After all jobs are complete, you can check the results in `/eval-results/ioi24/metrics.json`. You can also take a look at `/eval-results/ioi24/summarized-results/main_*`. They should look something like this: ``` ------------------------------------- ioi24 ------------------------------------- -evaluation_mode | num_entries | avg_tokens | gen_seconds | correct | total_score -pass@50 | 39 | 52225 | 99630 | 23.08% | 500 +------------------------------------------------------ ioi24 ------------------------------------------------------ +evaluation_mode | num_entries | avg_tokens | gen_seconds | correct | total_score | round_robin_score +pass@1[avg-of-50] | 39 | 40387 | 7410 | 0.51% ± 1.04% | 303.47 | 261.01 +pass@50 | 39 | 40387 | 7410 | 2.56% | 303.47 | 261.01 ``` ### livecodebench diff --git a/nemo_skills/dataset/ioi/__init__.py b/nemo_skills/dataset/ioi24/__init__.py similarity index 100% rename from nemo_skills/dataset/ioi/__init__.py rename to nemo_skills/dataset/ioi24/__init__.py diff --git a/nemo_skills/dataset/ioi/prepare.py b/nemo_skills/dataset/ioi24/prepare.py similarity index 93% rename from nemo_skills/dataset/ioi/prepare.py rename to nemo_skills/dataset/ioi24/prepare.py index 3849607b0f..656e480b60 100644 --- a/nemo_skills/dataset/ioi/prepare.py +++ b/nemo_skills/dataset/ioi24/prepare.py @@ -27,7 +27,6 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--split", type=str, default="test") - parser.add_argument("--suffix", type=str, default="24") args = parser.parse_args() data_dir = Path(__file__).absolute().parent @@ -51,7 +50,7 @@ } ) - with open(os.path.join(data_dir, f"ioi{args.suffix}.jsonl"), "w") as f: + with open(os.path.join(data_dir, f"{args.split}.jsonl"), "w") as f: f.write("\n".join(json.dumps(x) for x in entries)) tests_dataset = load_dataset("open-r1/ioi-test-cases", name="2024", split="train") @@ -83,5 +82,5 @@ "grader_files": entry["grader_files"], } - with open(os.path.join(data_dir, f"ioi{args.suffix}_metadata.json"), "w") as f: + with open(os.path.join(data_dir, f"{args.split}_metadata.json"), "w") as f: json.dump(final_structure, f) diff --git a/nemo_skills/dataset/ioi25/__init__.py b/nemo_skills/dataset/ioi25/__init__.py new file mode 100644 index 0000000000..3032b16653 --- /dev/null +++ b/nemo_skills/dataset/ioi25/__init__.py @@ -0,0 +1,31 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +todo: We are working on providing the data files that are necessary to run IOI25 evaluation. +""" + +# settings that define how evaluation should be done by default (all can be changed from cmdline) +GENERATION_ARGS = "++prompt_config=generic/default ++eval_type=ioi" +DATASET_GROUP = "code" +METRICS_TYPE = "ioi" + +# environment variables required by this benchmark +SANDBOX_ENV_VARS = [ + "UWSGI_PROCESSES=1024", + "UWSGI_CPU_AFFINITY=8", + "UWSGI_CHEAPER=1023", + "NUM_WORKERS=1", + "STATEFUL_SANDBOX=0", +] diff --git a/nemo_skills/evaluation/evaluator/ioi.py b/nemo_skills/evaluation/evaluator/ioi.py index 9d1738518b..239a23db6c 100644 --- a/nemo_skills/evaluation/evaluator/ioi.py +++ b/nemo_skills/evaluation/evaluator/ioi.py @@ -12,25 +12,23 @@ # See the License for the specific language governing permissions and # limitations under the License. import asyncio -import hashlib import json import multiprocessing import os import re -import shutil import threading import time +from typing import Dict from nemo_skills.code_execution.sandbox import LocalSandbox from nemo_skills.evaluation.evaluator.base import BaseEvaluator, BaseEvaluatorConfig from nemo_skills.file_utils import jdump -from nemo_skills.utils import nested_dataclass, unroll_files +from nemo_skills.utils import nested_dataclass @nested_dataclass(kw_only=True) class IOIEvaluatorConfig(BaseEvaluatorConfig): test_file: str = "test_metadata.json" - input_file: str | None = None num_workers: int = 16 # number of test workers test_batch_size: int = 16 # number of tests to run concurrently overwrite: bool = False @@ -42,10 +40,6 @@ class IOIEvaluatorConfig(BaseEvaluatorConfig): asyncio.set_event_loop(worker_loop) -def sha256_hex(text: str) -> str: - return hashlib.sha256(text.encode("utf-8", errors="replace")).hexdigest() - - def _sandbox_exec_sync(sandbox: LocalSandbox, cmd: str, *, language: str = "shell", timeout: int = 120): """Run sandbox.execute_code synchronously with a persistent event loop. @@ -94,31 +88,29 @@ def _precompile_grader( wait_for_sandbox(sandbox) sandbox._owner_tid = threading.get_ident() - pre_dir = f"/nemo_run/ioi_pre_{problem_name}_{os.getpid()}" - # Create directories and files locally; sandbox shares the same filesystem - os.makedirs(os.path.join(pre_dir, "graders"), exist_ok=True) - - # Dump grader related files locally + pre_dir = f"/tmp/ioi_pre_{problem_name}_{os.getpid()}" + # Build shell script to create files and invoke compile.sh. + creation_cmds = [ + f"mkdir -p {pre_dir}/graders", + ] + # Dump grader related files for filepath, content in grader_files: - target_path = os.path.join(pre_dir, filepath) - target_dir = os.path.dirname(target_path) - if target_dir: - os.makedirs(target_dir, exist_ok=True) - with open(target_path, "w", encoding="utf-8") as f: - f.write(content) - - # Write compile.sh and run.sh locally and make them executable - compile_path = os.path.join(pre_dir, "compile.sh") - with open(compile_path, "w", encoding="utf-8") as f: - f.write(compile_code) - os.chmod(compile_path, 0o755) - - run_path = os.path.join(pre_dir, "run.sh") - with open(run_path, "w", encoding="utf-8") as f: - f.write(run_code) - os.chmod(run_path, 0o755) - - # Run compile.sh inside the sandbox (same filesystem) + dir_name = os.path.dirname(filepath) + if dir_name: + creation_cmds.append(f"mkdir -p {pre_dir}/{dir_name}") + creation_cmds.append(f"cat <<'_EOT_' > {pre_dir}/{filepath}\n{content}\n_EOT_\n") + + # Write compile.sh and run.sh as provided (needed later in workers) + creation_cmds.append( + f"cat <<'_EOT_' > {pre_dir}/compile.sh\n{compile_code}\n_EOT_\nchmod +x {pre_dir}/compile.sh\n" + ) + creation_cmds.append(f"cat <<'_EOT_' > {pre_dir}/run.sh\n{run_code}\n_EOT_\nchmod +x {pre_dir}/run.sh\n") + + setup_script = "\n".join(creation_cmds) + # 1. create files + _sandbox_exec_sync(sandbox, setup_script, language="shell", timeout=120) + + # 2. run compile.sh but ignore final failure when problem cpp missing _sandbox_exec_sync(sandbox, f"cd {pre_dir} && ./compile.sh || true", language="shell", timeout=120) return pre_dir @@ -126,28 +118,46 @@ def _precompile_grader( def run_test_case(task_args: dict, worker_id: int) -> dict: # Use high-resolution timestamp to guarantee uniqueness across parallel calls. - unique_dir = f"/nemo_run/ioi_run_{worker_id}_{os.getpid()}_{time.time_ns()}" + unique_dir = f"/tmp/ioi_run_{worker_id}_{os.getpid()}_{time.time_ns()}" try: - # 1. Create all necessary files locally (sandbox shares filesystem) + # 1. Create all necessary files in one batch command precompiled_dir = task_args.get("precompiled_dir") - os.makedirs(unique_dir, exist_ok=True) - os.makedirs(os.path.join(unique_dir, "graders"), exist_ok=True) - # Copy precompiled assets into unique run directory - if precompiled_dir and os.path.isdir(precompiled_dir): - shutil.copytree(precompiled_dir, unique_dir, dirs_exist_ok=True) - # Write contestant solution - with open(os.path.join(unique_dir, "graders", f"{task_args['problem_id']}.cpp"), "w", encoding="utf-8") as f: - f.write(task_args["generated_code"]) + # Step 1: prepare the working directory and copy shared pre-compiled artifacts first + file_creation_commands = [ + # Create the unique run directory itself + f"mkdir -p {unique_dir}", + # Ensure `graders/` directory exists + f"mkdir -p {unique_dir}/graders", + f"cp -r {precompiled_dir}/* {unique_dir}/", + # Next write the contestant's generated solution into the graders folder so it is not overwritten + f"cat <<'_EOT_' > {unique_dir}/graders/{task_args['problem_id']}.cpp\n{task_args['generated_code']}\n_EOT_\n", + ] + # Prepare input and expected output files - with open(os.path.join(unique_dir, "input.txt"), "w", encoding="utf-8") as f: - f.write(task_args["test_input"]) - with open(os.path.join(unique_dir, "correct_output.txt"), "w", encoding="utf-8") as f: - f.write(task_args["test_output"]) + file_creation_commands.append(f"cat <<'_EOT_' > {unique_dir}/input.txt\n{task_args['test_input']}\n_EOT_\n") + file_creation_commands.append( + f"cat <<'_EOT_' > {unique_dir}/correct_output.txt\n{task_args['test_output']}\n_EOT_\n" + ) - # 2. Compile only the problem solution (skip checker/grader recompilation) - compile_command = f"cd {unique_dir} && ./compile.sh" + setup_script = "\n".join(file_creation_commands) sandbox = LocalSandbox() + setup_result, _ = worker_loop.run_until_complete( + sandbox.execute_code(setup_script, language="shell", timeout=120) + ) + if setup_result.get("stderr"): + raise Exception(f"File setup failed: {setup_result['stderr']}") + + # 2. Compile only the problem solution (skip checker/grader recompilation) + # Compile the solution together with optional grader/stub sources without + # recompiling the checker/manager again. + compile_command = ( + f"cd {unique_dir} && " + f'SRC="graders/{task_args["problem_id"]}.cpp"; ' + f'[ -e graders/grader.cpp ] && SRC="$SRC graders/grader.cpp"; ' + f'[ -e graders/stub.cpp ] && SRC="$SRC graders/stub.cpp"; ' + f"g++ -DEVAL -std=gnu++17 -O2 -pipe -s -o graders/{task_args['problem_id']} $SRC" + ) compile_result, _ = worker_loop.run_until_complete( sandbox.execute_code(compile_command, language="shell", timeout=120) ) @@ -192,80 +202,11 @@ def run_test_case(task_args: dict, worker_id: int) -> dict: return {"score": 0.0, "output": "", "error": str(e)} finally: - # 4. Clean up the directory locally - try: - shutil.rmtree(unique_dir, ignore_errors=True) - except Exception: - pass - - -def run_input_case(task_args: dict, worker_id: int) -> dict: - # Use high-resolution timestamp to guarantee uniqueness across parallel calls. - unique_dir = f"/nemo_run/ioi_run_{worker_id}_{os.getpid()}_{time.time_ns()}" - - try: - # 1. Create all necessary files locally (sandbox shares filesystem) - os.makedirs(unique_dir, exist_ok=True) - for filepath, content in task_args.get("run_files", []): - target_path = os.path.join(unique_dir, os.path.basename(filepath)) - with open(target_path, "w", encoding="utf-8") as f: - f.write(content) - for fname in ("compile", "run"): - fpath = os.path.join(unique_dir, fname) - if os.path.exists(fpath): - os.chmod(fpath, 0o755) - # Write contestant solution into problem solution file - solution_path = os.path.join(unique_dir, f"{task_args['problem_id']}.cpp") - with open(solution_path, "w", encoding="utf-8") as f: - f.write(task_args["generated_code"]) - # Prepare only input file (no ground-truth for input-only runs) - with open(os.path.join(unique_dir, "input.txt"), "w", encoding="utf-8") as f: - f.write(task_args["test_input"]) - - # 2. Compile using run_files toolchain - compile_command = f"cd {unique_dir} && ./compile" - sandbox = LocalSandbox() - compile_result, _ = worker_loop.run_until_complete( - sandbox.execute_code(compile_command, language="shell", timeout=120) - ) - - result = { - "compile_success": not compile_result.get("stderr"), - "compile_stdout": compile_result.get("stdout", ""), - "compile_stderr": compile_result.get("stderr", ""), - "run_stdout": "", - "run_stderr": "", - "error": "", - } - - if not result["compile_success"]: - return result - - # 3. Run the code using run_files runner - run_command = f"cd {unique_dir} && ./run < input.txt" - run_result, _ = worker_loop.run_until_complete( - sandbox.execute_code(run_command, language="shell", timeout=120, max_output_characters=1000000) - ) - - run_stdout = sha256_hex(run_result.get("stdout", "")) - run_stderr = run_result.get("stderr", "") - - result.update( - { - "run_stdout": run_stdout, - "run_stderr": run_stderr, - } - ) - - return result - - except Exception as e: - return {"run_stdout": "", "run_stderr": "", "error": str(e)} - - finally: - # 4. Clean up the directory locally + # 4. Clean up the directory + # Fire and forget; ignore return values try: - shutil.rmtree(unique_dir, ignore_errors=True) + sandbox = LocalSandbox() + worker_loop.run_until_complete(sandbox.execute_code(f"rm -rf {unique_dir}", language="shell", timeout=120)) except Exception: pass @@ -309,11 +250,10 @@ def __init__(self, config: dict, num_parallel_requests: int = 10): self.eval_cfg = IOIEvaluatorConfig(_init_nested=True, **config) # Heavy runtime resources are lazily initialized within _evaluate_entry. - self.sandbox = None - self.metadata = None - self.inputdata = None - self.precompiled_cache = {} - self.pool = None + self.sandbox = None # type: ignore + self.metadata = None # type: ignore + self.precompiled_cache: Dict[str, str] = {} + self.pool = None # type: ignore async def _initialize_runtime(self): """Asynchronously create sandbox and related runtime state on first use.""" @@ -335,23 +275,14 @@ def _setup(): ) with open(self.eval_cfg.test_file, "r") as f: metadata_local = json.load(f) - input_local = None - if self.eval_cfg.input_file: - if not os.path.exists(self.eval_cfg.input_file): - raise FileNotFoundError( - f"Input file {self.eval_cfg.input_file} does not exist." - " Please provide a valid parameter for ++eval_config.input_file=x when running IOI Evaluation." - ) - with open(self.eval_cfg.input_file, "r") as f: - input_local = json.load(f) pool_local = multiprocessing.Pool( processes=self.eval_cfg.test_batch_size, initializer=init_worker, ) - return sbox, metadata_local, input_local, pool_local + return sbox, metadata_local, pool_local - self.sandbox, self.metadata, self.inputdata, self.pool = await asyncio.to_thread(_setup) + self.sandbox, self.metadata, self.pool = await asyncio.to_thread(_setup) # Internal helper async def _evaluate_entry(self, entry: dict) -> dict: @@ -367,10 +298,9 @@ async def _evaluate_entry(self, entry: dict) -> dict: compile_code = subtask_meta["compile"] run_code = subtask_meta["run"] grader_files = subtask_meta["grader_files"] - run_files = subtask_meta.get("run_files", []) if pid not in self.precompiled_cache: - grader_dir = await asyncio.to_thread( + self.precompiled_cache[pid] = await asyncio.to_thread( _precompile_grader, pid, grader_files, @@ -378,8 +308,7 @@ async def _evaluate_entry(self, entry: dict) -> dict: run_code, self.sandbox, ) - self.precompiled_cache[pid] = {"grader": grader_dir} - pre_dir = self.precompiled_cache[pid]["grader"] + pre_dir = self.precompiled_cache[pid] subtask_state = { st: { @@ -439,53 +368,25 @@ async def _evaluate_entry(self, entry: dict) -> dict: score = round(min(data["scores"]) * data["score"], data["precision"]) if data["scores"] else 0.0 test_case_results[st] = {"score": score, "outputs": data["outputs"]} - # Optionally run custom input cases - input_outputs = [] - if self.inputdata is not None: - problem_inputs = self.inputdata[str(entry["id"])] - for i in range(0, len(problem_inputs), batch_size): - batch = problem_inputs[i : i + batch_size] - tasks = [] - for test_data in batch: - tasks.append( - { - "generated_code": completion, - "problem_id": pid, - "run_files": run_files, - "test_input": test_data["content"], - } - ) - # map with unique worker id argument - results = await asyncio.to_thread( - self.pool.starmap, run_input_case, [(ta, idx) for idx, ta in enumerate(tasks)] - ) - for test_data, result in zip(batch, results): - test_name = test_data["file_name"] - test_type = "input" - result["test_name"] = test_name - result["test_type"] = test_type - input_outputs.append(result) - return { "name": entry["name"], "subtask": entry["subtask"], "test_case_results": test_case_results, - "input_case_results": input_outputs, } - async def eval_full(self, input_files): # type: ignore[override] - for jsonl_file in unroll_files(input_files): - with open(jsonl_file, "r", encoding="utf-8") as f: - all_samples = [json.loads(line) for line in f] + async def eval_full(self): # type: ignore[override] + jsonl_file = self.eval_cfg.input_file + with open(jsonl_file, "r", encoding="utf-8") as f: + all_samples = [json.loads(line) for line in f] - tasks = [self._evaluate_entry(s) for s in all_samples] - outputs = await asyncio.gather(*tasks) + tasks = [self._evaluate_entry(s) for s in all_samples] + outputs = await asyncio.gather(*tasks) - for s, o in zip(all_samples, outputs): - s["test_case_results"] = o["test_case_results"] - s["input_case_results"] = o["input_case_results"] + for s, o in zip(all_samples, outputs): + s["test_case_results"] = o["test_case_results"] + s["eval_status"] = o["eval_status"] - jdump(all_samples, jsonl_file, mode="wt") + jdump(all_samples, jsonl_file, mode="wt") if self.pool is not None: self.pool.close() diff --git a/nemo_skills/evaluation/metrics/ioi_metrics.py b/nemo_skills/evaluation/metrics/ioi_metrics.py index 4f4431a3bd..a2028f6a6d 100644 --- a/nemo_skills/evaluation/metrics/ioi_metrics.py +++ b/nemo_skills/evaluation/metrics/ioi_metrics.py @@ -11,26 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import json -import os -import re from collections import defaultdict from nemo_skills.evaluation.metrics.base import BaseMetrics -def extract_final_cpp_block(text): - pattern = r"```(?:cpp|Cpp)\s*\n(.*?)```" - matches = re.findall(pattern, text, re.DOTALL) - return matches[-1] if matches else "" - - class IOIMetrics(BaseMetrics): - def __init__(self, **kwargs): + def __init__(self): super().__init__() self.reset() - self.cluster_folder = kwargs.get("cluster_folder", None) - print(f"Cluster folder: {self.cluster_folder}") def update(self, predictions): super().update(predictions) @@ -41,54 +30,6 @@ def update(self, predictions): def _get_score_dict(self, p): return {"correct": all(r["score"] > 0 for r in p["test_case_results"].values())} - def extract_info(self, submission) -> dict: - # Aggregate IOI per-submission scores for convenience - subtask_scores = [v["score"] for _, v in submission["test_case_results"].items()] - return { - "grade": subtask_scores, - "tokens": submission["num_generated_tokens"], - "code": extract_final_cpp_block(submission["generation"]), - } - - def get_clusters(self, submissions) -> dict: - clusters = defaultdict(list) - id = 0 - - for submission in submissions: - input_results = submission.get("input_case_results", []) - run_outputs = [] - for output in input_results: - if "run_stdout" not in output: - continue - run_outputs.append(output["run_stdout"]) - output_key = tuple(run_outputs) - - extract_info = self.extract_info(submission) - if output_key not in clusters: - # Initialize per-subtask maxima and counts with this submission's scores - subtask_score_list = [res["score"] for _, res in submission["test_case_results"].items()] - clusters[output_key] = { - "codes": [], - "max_score": subtask_score_list[:], - "max_score_solutions": [1] * len(subtask_score_list), - } - else: - # Update maxima and counts element-wise from this submission - subtask_score_list = [res["score"] for _, res in submission["test_case_results"].items()] - max_scores = clusters[output_key]["max_score"] - max_counts = clusters[output_key]["max_score_solutions"] - for idx, score_val in enumerate(subtask_score_list): - if score_val > max_scores[idx]: - max_scores[idx] = score_val - max_counts[idx] = 1 - elif score_val == max_scores[idx]: - max_counts[idx] += 1 - clusters[output_key]["codes"].append(extract_info) - - id = submission.get("id", id) - - return clusters, id - def get_problem_score(self, submissions) -> float: """ For a given problem (list of submissions), compute the score as follows: @@ -96,7 +37,7 @@ def get_problem_score(self, submissions) -> float: - Sum these maximum scores to get the problem score. """ if not submissions: - return 0.0, {} + return 0.0 subtask_scores = {} for submission in submissions: @@ -104,70 +45,63 @@ def get_problem_score(self, submissions) -> float: subtask_scores[subtask] = max(subtask_scores.get(subtask, 0), result["score"]) return sum(subtask_scores.values()), subtask_scores + def simulate_round_robin_score(self, submissions) -> float: + """ + Computes a round robin score for a problem. + The procedure is as follows: + 1. For each submission, compute an aggregate score (sum of subtask scores). + 2. Sort submissions in descending order by the aggregate score. + 3. Select up to 50 submissions. + 4. For each subtask, take the maximum score among the selected submissions. + 5. Return the sum of these maximum subtask scores. + """ + if not submissions: + return 0.0 + + # compute an aggregate score per submission + for submission in submissions: + aggregate_score = sum(result["score"] for result in submission["test_case_results"].values()) + submission["_aggregate_score"] = aggregate_score + + # sort submissions in descending order by aggregate score + sorted_submissions = sorted(submissions, key=lambda s: s["_aggregate_score"], reverse=True) + # Select up to 50 submissions. + selected = sorted_submissions[:50] + + # for each subtask, take the maximum score among the selected submissions + subtask_scores = {} + for submission in selected: + for subtask, result in submission["test_case_results"].items(): + subtask_scores[subtask] = max(subtask_scores.get(subtask, 0), result["score"]) + return sum(subtask_scores.values()) + def get_metrics(self): - total_score = 0.0 + total_score = total_round_robin = 0.0 self.problem_scores = {} for name, submissions in self.predictions_by_problem.items(): - # Cluster the submissions if requested - if self.cluster_folder: - os.makedirs(self.cluster_folder, exist_ok=True) - submissions_by_id = defaultdict(list) - for sub in submissions: - submissions_by_id[sub["id"]].append(sub) - for sid, sid_submissions in submissions_by_id.items(): - clusters, _ = self.get_clusters(sid_submissions) - final_clusters = {} - for i, (output_key, cluster) in enumerate(clusters.items()): - final_clusters[f"cluster_{i + 1}"] = { - "output": output_key, - "codes": cluster["codes"], - "max_score": cluster["max_score"], - "max_score_solutions": cluster["max_score_solutions"], - } - output_file = os.path.join(self.cluster_folder, f"{sid}_cluster.jsonl") - with open(output_file, "w") as f: - json.dump(final_clusters, f, indent=4) - score, subtasks = self.get_problem_score(submissions) self.problem_scores[name] = (score, subtasks) total_score += score - - per_problem_subtask_scores = {} - for name, (achieved_total, achieved_subtasks) in self.problem_scores.items(): - submissions = self.predictions_by_problem[name] - max_subtasks = {} - for sub in submissions: - max_subtasks[sub["subtask"]] = sub["subtask_score"] - max_total = sum(max_subtasks.values()) - per_problem_subtask_scores[name] = { - "total": {"score": achieved_total, "max_score": max_total}, - "subtasks": { - subtask: {"score": achieved, "max_score": max_subtasks[subtask]} - for subtask, achieved in achieved_subtasks.items() - }, - } - + total_round_robin += self.simulate_round_robin_score(submissions) + self.print_problem_scores() metrics_dict = super().get_metrics() for m in metrics_dict.values(): - m["total_score"] = int(total_score) - m["per_problem_subtask_scores"] = per_problem_subtask_scores - self.per_problem_subtask_scores = per_problem_subtask_scores - self.print_problem_scores() + m["total_score"], m["round_robin_score"] = str(total_score), str(total_round_robin) return metrics_dict def reset(self): super().reset() self.predictions_by_problem = defaultdict(list) self.problem_scores = {} - self.per_problem_subtask_scores = {} - - def evaluations_to_print(self): - return [f"pass@{self.max_k}"] def print_problem_scores(self): print("---------------------------------Problem and subtask scores---------------------------------") - for name, info in self.per_problem_subtask_scores.items(): - total = info["total"] - print(f"# {name}: {int(total['score'])}/{int(total['max_score'])}") - for subtask, subinfo in info["subtasks"].items(): - print(f" {subtask}: {int(subinfo['score'])}/{int(subinfo['max_score'])}") + for name, (achieved_total, achieved_subtasks) in self.problem_scores.items(): + submissions = self.predictions_by_problem[name] + max_subtasks = {} + for sub in submissions: + max_subtasks[sub["subtask"]] = sub["subtask_score"] + max_total = sum(max_subtasks.values()) + print(f"# {name}: {achieved_total}/{max_total}") + for subtask, achieved in achieved_subtasks.items(): + print(f" {subtask}: {achieved}/{max_subtasks[subtask]}")