diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 0908e9d58e..3c6cd2e0c7 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -85,7 +85,10 @@ jobs: NVIDIA_API_KEY: ${{ secrets.NVIDIA_API_KEY }} HF_TOKEN: ${{ secrets.HF_TOKEN }} run: | - docker run --rm --network=host nemo-skills-sandbox-image & + # Default shared runtime directory + sudo mkdir -p /nemo_run + sudo chmod 777 /nemo_run + docker run --rm --network=host -v /nemo_run:/nemo_run nemo-skills-sandbox-image & sleep 10 set -o pipefail # this will make sure next line returns non-0 exit code if tests fail ns prepare_data gsm8k math-500 diff --git a/docs/evaluation/code.md b/docs/evaluation/code.md index 5a8d634bcc..1b4a60a14e 100644 --- a/docs/evaluation/code.md +++ b/docs/evaluation/code.md @@ -185,10 +185,10 @@ We currently support IOI24 and are working to support IOI25 for evaluation. The #### Data Preparation -First, prepare the dataset by running the `ns prepare_data` command. The arguments below will generate `test.jsonl` and `test_metadata.json`. +First, prepare the dataset by running the `ns prepare_data` command. The arguments below will generate `ioi24.jsonl` and `ioi24_metadata.json`. ``` -ns prepare_data ioi24 +ns prepare_data ioi ``` #### Running the Evaluation @@ -209,10 +209,11 @@ ns eval \ --server_gpus=8 \ --benchmarks=ioi24:50 \ --with_sandbox \ - --split=test \ + --split=ioi24 \ --data_dir=<DATA_DIR> \ --output_dir=<OUTPUT_DIR> \ - --extra_eval_args="++eval_config.test_file=<PATH_TO_METADATA_FILE>" \ + --eval_subfolder=eval-results/ioi24/ \ # set the folder if you want to differentiate subsets. + --extra_eval_args="++eval_config.test_file=<PATH_TO_METADATA_DIR>/ioi24_metadata.json" \ ++inference.temperature=0.6 \ ++inference.top_p=0.95 \ ++inference.tokens_to_generate=65536 @@ -220,13 +221,12 @@ ns eval \ ##### Verifying Results -After all jobs are complete, you can check the results in `<OUTPUT_DIR>/eval-results/ioi24/metrics.json`. You can also take a look at `<OUTPUT_DIR>/eval-results/ioi24/summarized-results/main_*`. 
They should look something like this: +After all jobs are complete, you can check the results in `<OUTPUT_DIR>/eval-results/ioi24/ioi/metrics.json`. You can also take a look at `<OUTPUT_DIR>/eval-results/ioi24/ioi/summarized-results/main_*`. They should look something like this: ``` ------------------------------------------------------- ioi24 ------------------------------------------------------ -evaluation_mode | num_entries | avg_tokens | gen_seconds | correct | total_score | round_robin_score -pass@1[avg-of-50] | 39 | 40387 | 7410 | 0.51% ± 1.04% | 303.47 | 261.01 -pass@50 | 39 | 40387 | 7410 | 2.56% | 303.47 | 261.01 +------------------------------------ ioi24 ------------------------------------- +evaluation_mode | num_entries | avg_tokens | gen_seconds | correct | total_score +pass@50 | 39 | 52225 | 99630 | 23.08% | 500 ``` ### livecodebench diff --git a/nemo_skills/dataset/ioi24/__init__.py b/nemo_skills/dataset/ioi/__init__.py similarity index 100% rename from nemo_skills/dataset/ioi24/__init__.py rename to nemo_skills/dataset/ioi/__init__.py diff --git a/nemo_skills/dataset/ioi24/prepare.py b/nemo_skills/dataset/ioi/prepare.py similarity index 93% rename from nemo_skills/dataset/ioi24/prepare.py rename to nemo_skills/dataset/ioi/prepare.py index 656e480b60..3849607b0f 100644 --- a/nemo_skills/dataset/ioi24/prepare.py +++ b/nemo_skills/dataset/ioi/prepare.py @@ -27,6 +27,7 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--split", type=str, default="test") + parser.add_argument("--suffix", type=str, default="24") args = parser.parse_args() data_dir = Path(__file__).absolute().parent @@ -50,7 +51,7 @@ } ) - with open(os.path.join(data_dir, f"{args.split}.jsonl"), "w") as f: + with open(os.path.join(data_dir, f"ioi{args.suffix}.jsonl"), "w") as f: f.write("\n".join(json.dumps(x) for x in entries)) tests_dataset = load_dataset("open-r1/ioi-test-cases", name="2024", split="train") @@ -82,5 +83,5 @@ "grader_files": entry["grader_files"], } - with 
open(os.path.join(data_dir, f"{args.split}_metadata.json"), "w") as f: + with open(os.path.join(data_dir, f"ioi{args.suffix}_metadata.json"), "w") as f: json.dump(final_structure, f) diff --git a/nemo_skills/dataset/ioi25/__init__.py b/nemo_skills/dataset/ioi25/__init__.py deleted file mode 100644 index 3032b16653..0000000000 --- a/nemo_skills/dataset/ioi25/__init__.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -todo: We are working on providing the data files that are necessary to run IOI25 evaluation. -""" - -# settings that define how evaluation should be done by default (all can be changed from cmdline) -GENERATION_ARGS = "++prompt_config=generic/default ++eval_type=ioi" -DATASET_GROUP = "code" -METRICS_TYPE = "ioi" - -# environment variables required by this benchmark -SANDBOX_ENV_VARS = [ - "UWSGI_PROCESSES=1024", - "UWSGI_CPU_AFFINITY=8", - "UWSGI_CHEAPER=1023", - "NUM_WORKERS=1", - "STATEFUL_SANDBOX=0", -] diff --git a/nemo_skills/evaluation/evaluator/ioi.py b/nemo_skills/evaluation/evaluator/ioi.py index 239a23db6c..9d1738518b 100644 --- a/nemo_skills/evaluation/evaluator/ioi.py +++ b/nemo_skills/evaluation/evaluator/ioi.py @@ -12,23 +12,25 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import asyncio +import hashlib import json import multiprocessing import os import re +import shutil import threading import time -from typing import Dict from nemo_skills.code_execution.sandbox import LocalSandbox from nemo_skills.evaluation.evaluator.base import BaseEvaluator, BaseEvaluatorConfig from nemo_skills.file_utils import jdump -from nemo_skills.utils import nested_dataclass +from nemo_skills.utils import nested_dataclass, unroll_files @nested_dataclass(kw_only=True) class IOIEvaluatorConfig(BaseEvaluatorConfig): test_file: str = "test_metadata.json" + input_file: str | None = None num_workers: int = 16 # number of test workers test_batch_size: int = 16 # number of tests to run concurrently overwrite: bool = False @@ -40,6 +42,10 @@ class IOIEvaluatorConfig(BaseEvaluatorConfig): asyncio.set_event_loop(worker_loop) +def sha256_hex(text: str) -> str: + return hashlib.sha256(text.encode("utf-8", errors="replace")).hexdigest() + + def _sandbox_exec_sync(sandbox: LocalSandbox, cmd: str, *, language: str = "shell", timeout: int = 120): """Run sandbox.execute_code synchronously with a persistent event loop. @@ -88,29 +94,31 @@ def _precompile_grader( wait_for_sandbox(sandbox) sandbox._owner_tid = threading.get_ident() - pre_dir = f"/tmp/ioi_pre_{problem_name}_{os.getpid()}" - # Build shell script to create files and invoke compile.sh. 
- creation_cmds = [ - f"mkdir -p {pre_dir}/graders", - ] - # Dump grader related files + pre_dir = f"/nemo_run/ioi_pre_{problem_name}_{os.getpid()}" + # Create directories and files locally; sandbox shares the same filesystem + os.makedirs(os.path.join(pre_dir, "graders"), exist_ok=True) + + # Dump grader related files locally for filepath, content in grader_files: - dir_name = os.path.dirname(filepath) - if dir_name: - creation_cmds.append(f"mkdir -p {pre_dir}/{dir_name}") - creation_cmds.append(f"cat <<'_EOT_' > {pre_dir}/{filepath}\n{content}\n_EOT_\n") - - # Write compile.sh and run.sh as provided (needed later in workers) - creation_cmds.append( - f"cat <<'_EOT_' > {pre_dir}/compile.sh\n{compile_code}\n_EOT_\nchmod +x {pre_dir}/compile.sh\n" - ) - creation_cmds.append(f"cat <<'_EOT_' > {pre_dir}/run.sh\n{run_code}\n_EOT_\nchmod +x {pre_dir}/run.sh\n") - - setup_script = "\n".join(creation_cmds) - # 1. create files - _sandbox_exec_sync(sandbox, setup_script, language="shell", timeout=120) - - # 2. 
run compile.sh but ignore final failure when problem cpp missing + target_path = os.path.join(pre_dir, filepath) + target_dir = os.path.dirname(target_path) + if target_dir: + os.makedirs(target_dir, exist_ok=True) + with open(target_path, "w", encoding="utf-8") as f: + f.write(content) + + # Write compile.sh and run.sh locally and make them executable + compile_path = os.path.join(pre_dir, "compile.sh") + with open(compile_path, "w", encoding="utf-8") as f: + f.write(compile_code) + os.chmod(compile_path, 0o755) + + run_path = os.path.join(pre_dir, "run.sh") + with open(run_path, "w", encoding="utf-8") as f: + f.write(run_code) + os.chmod(run_path, 0o755) + + # Run compile.sh inside the sandbox (same filesystem) _sandbox_exec_sync(sandbox, f"cd {pre_dir} && ./compile.sh || true", language="shell", timeout=120) return pre_dir @@ -118,46 +126,28 @@ def _precompile_grader( def run_test_case(task_args: dict, worker_id: int) -> dict: # Use high-resolution timestamp to guarantee uniqueness across parallel calls. - unique_dir = f"/tmp/ioi_run_{worker_id}_{os.getpid()}_{time.time_ns()}" + unique_dir = f"/nemo_run/ioi_run_{worker_id}_{os.getpid()}_{time.time_ns()}" try: - # 1. Create all necessary files in one batch command + # 1. 
Create all necessary files locally (sandbox shares filesystem) precompiled_dir = task_args.get("precompiled_dir") - # Step 1: prepare the working directory and copy shared pre-compiled artifacts first - file_creation_commands = [ - # Create the unique run directory itself - f"mkdir -p {unique_dir}", - # Ensure `graders/` directory exists - f"mkdir -p {unique_dir}/graders", - f"cp -r {precompiled_dir}/* {unique_dir}/", - # Next write the contestant's generated solution into the graders folder so it is not overwritten - f"cat <<'_EOT_' > {unique_dir}/graders/{task_args['problem_id']}.cpp\n{task_args['generated_code']}\n_EOT_\n", - ] - + os.makedirs(unique_dir, exist_ok=True) + os.makedirs(os.path.join(unique_dir, "graders"), exist_ok=True) + # Copy precompiled assets into unique run directory + if precompiled_dir and os.path.isdir(precompiled_dir): + shutil.copytree(precompiled_dir, unique_dir, dirs_exist_ok=True) + # Write contestant solution + with open(os.path.join(unique_dir, "graders", f"{task_args['problem_id']}.cpp"), "w", encoding="utf-8") as f: + f.write(task_args["generated_code"]) # Prepare input and expected output files - file_creation_commands.append(f"cat <<'_EOT_' > {unique_dir}/input.txt\n{task_args['test_input']}\n_EOT_\n") - file_creation_commands.append( - f"cat <<'_EOT_' > {unique_dir}/correct_output.txt\n{task_args['test_output']}\n_EOT_\n" - ) - - setup_script = "\n".join(file_creation_commands) - sandbox = LocalSandbox() - setup_result, _ = worker_loop.run_until_complete( - sandbox.execute_code(setup_script, language="shell", timeout=120) - ) - if setup_result.get("stderr"): - raise Exception(f"File setup failed: {setup_result['stderr']}") + with open(os.path.join(unique_dir, "input.txt"), "w", encoding="utf-8") as f: + f.write(task_args["test_input"]) + with open(os.path.join(unique_dir, "correct_output.txt"), "w", encoding="utf-8") as f: + f.write(task_args["test_output"]) # 2. 
Compile only the problem solution (skip checker/grader recompilation) - # Compile the solution together with optional grader/stub sources without - # recompiling the checker/manager again. - compile_command = ( - f"cd {unique_dir} && " - f'SRC="graders/{task_args["problem_id"]}.cpp"; ' - f'[ -e graders/grader.cpp ] && SRC="$SRC graders/grader.cpp"; ' - f'[ -e graders/stub.cpp ] && SRC="$SRC graders/stub.cpp"; ' - f"g++ -DEVAL -std=gnu++17 -O2 -pipe -s -o graders/{task_args['problem_id']} $SRC" - ) + compile_command = f"cd {unique_dir} && ./compile.sh" + sandbox = LocalSandbox() compile_result, _ = worker_loop.run_until_complete( sandbox.execute_code(compile_command, language="shell", timeout=120) ) @@ -202,11 +192,80 @@ def run_test_case(task_args: dict, worker_id: int) -> dict: return {"score": 0.0, "output": "", "error": str(e)} finally: - # 4. Clean up the directory - # Fire and forget; ignore return values + # 4. Clean up the directory locally + try: + shutil.rmtree(unique_dir, ignore_errors=True) + except Exception: + pass + + +def run_input_case(task_args: dict, worker_id: int) -> dict: + # Use high-resolution timestamp to guarantee uniqueness across parallel calls. + unique_dir = f"/nemo_run/ioi_run_{worker_id}_{os.getpid()}_{time.time_ns()}" + + try: + # 1. 
Create all necessary files locally (sandbox shares filesystem) + os.makedirs(unique_dir, exist_ok=True) + for filepath, content in task_args.get("run_files", []): + target_path = os.path.join(unique_dir, os.path.basename(filepath)) + with open(target_path, "w", encoding="utf-8") as f: + f.write(content) + for fname in ("compile", "run"): + fpath = os.path.join(unique_dir, fname) + if os.path.exists(fpath): + os.chmod(fpath, 0o755) + # Write contestant solution into problem solution file + solution_path = os.path.join(unique_dir, f"{task_args['problem_id']}.cpp") + with open(solution_path, "w", encoding="utf-8") as f: + f.write(task_args["generated_code"]) + # Prepare only input file (no ground-truth for input-only runs) + with open(os.path.join(unique_dir, "input.txt"), "w", encoding="utf-8") as f: + f.write(task_args["test_input"]) + + # 2. Compile using run_files toolchain + compile_command = f"cd {unique_dir} && ./compile" + sandbox = LocalSandbox() + compile_result, _ = worker_loop.run_until_complete( + sandbox.execute_code(compile_command, language="shell", timeout=120) + ) + + result = { + "compile_success": not compile_result.get("stderr"), + "compile_stdout": compile_result.get("stdout", ""), + "compile_stderr": compile_result.get("stderr", ""), + "run_stdout": "", + "run_stderr": "", + "error": "", + } + + if not result["compile_success"]: + return result + + # 3. Run the code using run_files runner + run_command = f"cd {unique_dir} && ./run < input.txt" + run_result, _ = worker_loop.run_until_complete( + sandbox.execute_code(run_command, language="shell", timeout=120, max_output_characters=1000000) + ) + + run_stdout = sha256_hex(run_result.get("stdout", "")) + run_stderr = run_result.get("stderr", "") + + result.update( + { + "run_stdout": run_stdout, + "run_stderr": run_stderr, + } + ) + + return result + + except Exception as e: + return {"run_stdout": "", "run_stderr": "", "error": str(e)} + + finally: + # 4. 
Clean up the directory locally try: - sandbox = LocalSandbox() - worker_loop.run_until_complete(sandbox.execute_code(f"rm -rf {unique_dir}", language="shell", timeout=120)) + shutil.rmtree(unique_dir, ignore_errors=True) except Exception: pass @@ -250,10 +309,11 @@ def __init__(self, config: dict, num_parallel_requests: int = 10): self.eval_cfg = IOIEvaluatorConfig(_init_nested=True, **config) # Heavy runtime resources are lazily initialized within _evaluate_entry. - self.sandbox = None # type: ignore - self.metadata = None # type: ignore - self.precompiled_cache: Dict[str, str] = {} - self.pool = None # type: ignore + self.sandbox = None + self.metadata = None + self.inputdata = None + self.precompiled_cache = {} + self.pool = None async def _initialize_runtime(self): """Asynchronously create sandbox and related runtime state on first use.""" @@ -275,14 +335,23 @@ def _setup(): ) with open(self.eval_cfg.test_file, "r") as f: metadata_local = json.load(f) + input_local = None + if self.eval_cfg.input_file: + if not os.path.exists(self.eval_cfg.input_file): + raise FileNotFoundError( + f"Input file {self.eval_cfg.input_file} does not exist." + " Please provide a valid parameter for ++eval_config.input_file=x when running IOI Evaluation." 
+ ) + with open(self.eval_cfg.input_file, "r") as f: + input_local = json.load(f) pool_local = multiprocessing.Pool( processes=self.eval_cfg.test_batch_size, initializer=init_worker, ) - return sbox, metadata_local, pool_local + return sbox, metadata_local, input_local, pool_local - self.sandbox, self.metadata, self.pool = await asyncio.to_thread(_setup) + self.sandbox, self.metadata, self.inputdata, self.pool = await asyncio.to_thread(_setup) # Internal helper async def _evaluate_entry(self, entry: dict) -> dict: @@ -298,9 +367,10 @@ async def _evaluate_entry(self, entry: dict) -> dict: compile_code = subtask_meta["compile"] run_code = subtask_meta["run"] grader_files = subtask_meta["grader_files"] + run_files = subtask_meta.get("run_files", []) if pid not in self.precompiled_cache: - self.precompiled_cache[pid] = await asyncio.to_thread( + grader_dir = await asyncio.to_thread( _precompile_grader, pid, grader_files, @@ -308,7 +378,8 @@ async def _evaluate_entry(self, entry: dict) -> dict: run_code, self.sandbox, ) - pre_dir = self.precompiled_cache[pid] + self.precompiled_cache[pid] = {"grader": grader_dir} + pre_dir = self.precompiled_cache[pid]["grader"] subtask_state = { st: { @@ -368,25 +439,53 @@ async def _evaluate_entry(self, entry: dict) -> dict: score = round(min(data["scores"]) * data["score"], data["precision"]) if data["scores"] else 0.0 test_case_results[st] = {"score": score, "outputs": data["outputs"]} + # Optionally run custom input cases + input_outputs = [] + if self.inputdata is not None: + problem_inputs = self.inputdata[str(entry["id"])] + for i in range(0, len(problem_inputs), batch_size): + batch = problem_inputs[i : i + batch_size] + tasks = [] + for test_data in batch: + tasks.append( + { + "generated_code": completion, + "problem_id": pid, + "run_files": run_files, + "test_input": test_data["content"], + } + ) + # map with unique worker id argument + results = await asyncio.to_thread( + self.pool.starmap, run_input_case, [(ta, idx) for 
idx, ta in enumerate(tasks)] + ) + for test_data, result in zip(batch, results): + test_name = test_data["file_name"] + test_type = "input" + result["test_name"] = test_name + result["test_type"] = test_type + input_outputs.append(result) + return { "name": entry["name"], "subtask": entry["subtask"], "test_case_results": test_case_results, + "input_case_results": input_outputs, } - async def eval_full(self): # type: ignore[override] - jsonl_file = self.eval_cfg.input_file - with open(jsonl_file, "r", encoding="utf-8") as f: - all_samples = [json.loads(line) for line in f] + async def eval_full(self, input_files): # type: ignore[override] + for jsonl_file in unroll_files(input_files): + with open(jsonl_file, "r", encoding="utf-8") as f: + all_samples = [json.loads(line) for line in f] - tasks = [self._evaluate_entry(s) for s in all_samples] - outputs = await asyncio.gather(*tasks) + tasks = [self._evaluate_entry(s) for s in all_samples] + outputs = await asyncio.gather(*tasks) - for s, o in zip(all_samples, outputs): - s["test_case_results"] = o["test_case_results"] - s["eval_status"] = o["eval_status"] + for s, o in zip(all_samples, outputs): + s["test_case_results"] = o["test_case_results"] + s["input_case_results"] = o["input_case_results"] - jdump(all_samples, jsonl_file, mode="wt") + jdump(all_samples, jsonl_file, mode="wt") if self.pool is not None: self.pool.close() diff --git a/nemo_skills/evaluation/metrics/ioi_metrics.py b/nemo_skills/evaluation/metrics/ioi_metrics.py index a2028f6a6d..4f4431a3bd 100644 --- a/nemo_skills/evaluation/metrics/ioi_metrics.py +++ b/nemo_skills/evaluation/metrics/ioi_metrics.py @@ -11,15 +11,26 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import json +import os +import re from collections import defaultdict from nemo_skills.evaluation.metrics.base import BaseMetrics +def extract_final_cpp_block(text): + pattern = r"```(?:cpp|Cpp)\s*\n(.*?)```" + matches = re.findall(pattern, text, re.DOTALL) + return matches[-1] if matches else "" + + class IOIMetrics(BaseMetrics): - def __init__(self): + def __init__(self, **kwargs): super().__init__() self.reset() + self.cluster_folder = kwargs.get("cluster_folder", None) + print(f"Cluster folder: {self.cluster_folder}") def update(self, predictions): super().update(predictions) @@ -30,6 +41,54 @@ def update(self, predictions): def _get_score_dict(self, p): return {"correct": all(r["score"] > 0 for r in p["test_case_results"].values())} + def extract_info(self, submission) -> dict: + # Aggregate IOI per-submission scores for convenience + subtask_scores = [v["score"] for _, v in submission["test_case_results"].items()] + return { + "grade": subtask_scores, + "tokens": submission["num_generated_tokens"], + "code": extract_final_cpp_block(submission["generation"]), + } + + def get_clusters(self, submissions) -> dict: + clusters = defaultdict(list) + id = 0 + + for submission in submissions: + input_results = submission.get("input_case_results", []) + run_outputs = [] + for output in input_results: + if "run_stdout" not in output: + continue + run_outputs.append(output["run_stdout"]) + output_key = tuple(run_outputs) + + extract_info = self.extract_info(submission) + if output_key not in clusters: + # Initialize per-subtask maxima and counts with this submission's scores + subtask_score_list = [res["score"] for _, res in submission["test_case_results"].items()] + clusters[output_key] = { + "codes": [], + "max_score": subtask_score_list[:], + "max_score_solutions": [1] * len(subtask_score_list), + } + else: + # Update maxima and counts element-wise from this submission + subtask_score_list = [res["score"] for _, res in submission["test_case_results"].items()] + 
max_scores = clusters[output_key]["max_score"] + max_counts = clusters[output_key]["max_score_solutions"] + for idx, score_val in enumerate(subtask_score_list): + if score_val > max_scores[idx]: + max_scores[idx] = score_val + max_counts[idx] = 1 + elif score_val == max_scores[idx]: + max_counts[idx] += 1 + clusters[output_key]["codes"].append(extract_info) + + id = submission.get("id", id) + + return clusters, id + def get_problem_score(self, submissions) -> float: """ For a given problem (list of submissions), compute the score as follows: @@ -37,7 +96,7 @@ def get_problem_score(self, submissions) -> float: - Sum these maximum scores to get the problem score. """ if not submissions: - return 0.0 + return 0.0, {} subtask_scores = {} for submission in submissions: @@ -45,63 +104,70 @@ def get_problem_score(self, submissions) -> float: subtask_scores[subtask] = max(subtask_scores.get(subtask, 0), result["score"]) return sum(subtask_scores.values()), subtask_scores - def simulate_round_robin_score(self, submissions) -> float: - """ - Computes a round robin score for a problem. - The procedure is as follows: - 1. For each submission, compute an aggregate score (sum of subtask scores). - 2. Sort submissions in descending order by the aggregate score. - 3. Select up to 50 submissions. - 4. For each subtask, take the maximum score among the selected submissions. - 5. Return the sum of these maximum subtask scores. - """ - if not submissions: - return 0.0 - - # compute an aggregate score per submission - for submission in submissions: - aggregate_score = sum(result["score"] for result in submission["test_case_results"].values()) - submission["_aggregate_score"] = aggregate_score - - # sort submissions in descending order by aggregate score - sorted_submissions = sorted(submissions, key=lambda s: s["_aggregate_score"], reverse=True) - # Select up to 50 submissions. 
- selected = sorted_submissions[:50] - - # for each subtask, take the maximum score among the selected submissions - subtask_scores = {} - for submission in selected: - for subtask, result in submission["test_case_results"].items(): - subtask_scores[subtask] = max(subtask_scores.get(subtask, 0), result["score"]) - return sum(subtask_scores.values()) - def get_metrics(self): - total_score = total_round_robin = 0.0 + total_score = 0.0 self.problem_scores = {} for name, submissions in self.predictions_by_problem.items(): + # Cluster the submissions if requested + if self.cluster_folder: + os.makedirs(self.cluster_folder, exist_ok=True) + submissions_by_id = defaultdict(list) + for sub in submissions: + submissions_by_id[sub["id"]].append(sub) + for sid, sid_submissions in submissions_by_id.items(): + clusters, _ = self.get_clusters(sid_submissions) + final_clusters = {} + for i, (output_key, cluster) in enumerate(clusters.items()): + final_clusters[f"cluster_{i + 1}"] = { + "output": output_key, + "codes": cluster["codes"], + "max_score": cluster["max_score"], + "max_score_solutions": cluster["max_score_solutions"], + } + output_file = os.path.join(self.cluster_folder, f"{sid}_cluster.jsonl") + with open(output_file, "w") as f: + json.dump(final_clusters, f, indent=4) + score, subtasks = self.get_problem_score(submissions) self.problem_scores[name] = (score, subtasks) total_score += score - total_round_robin += self.simulate_round_robin_score(submissions) - self.print_problem_scores() + + per_problem_subtask_scores = {} + for name, (achieved_total, achieved_subtasks) in self.problem_scores.items(): + submissions = self.predictions_by_problem[name] + max_subtasks = {} + for sub in submissions: + max_subtasks[sub["subtask"]] = sub["subtask_score"] + max_total = sum(max_subtasks.values()) + per_problem_subtask_scores[name] = { + "total": {"score": achieved_total, "max_score": max_total}, + "subtasks": { + subtask: {"score": achieved, "max_score": max_subtasks[subtask]} + 
for subtask, achieved in achieved_subtasks.items() + }, + } + metrics_dict = super().get_metrics() for m in metrics_dict.values(): - m["total_score"], m["round_robin_score"] = str(total_score), str(total_round_robin) + m["total_score"] = int(total_score) + m["per_problem_subtask_scores"] = per_problem_subtask_scores + self.per_problem_subtask_scores = per_problem_subtask_scores + self.print_problem_scores() return metrics_dict def reset(self): super().reset() self.predictions_by_problem = defaultdict(list) self.problem_scores = {} + self.per_problem_subtask_scores = {} + + def evaluations_to_print(self): + return [f"pass@{self.max_k}"] def print_problem_scores(self): print("---------------------------------Problem and subtask scores---------------------------------") - for name, (achieved_total, achieved_subtasks) in self.problem_scores.items(): - submissions = self.predictions_by_problem[name] - max_subtasks = {} - for sub in submissions: - max_subtasks[sub["subtask"]] = sub["subtask_score"] - max_total = sum(max_subtasks.values()) - print(f"# {name}: {achieved_total}/{max_total}") - for subtask, achieved in achieved_subtasks.items(): - print(f" {subtask}: {achieved}/{max_subtasks[subtask]}") + for name, info in self.per_problem_subtask_scores.items(): + total = info["total"] + print(f"# {name}: {int(total['score'])}/{int(total['max_score'])}") + for subtask, subinfo in info["subtasks"].items(): + print(f" {subtask}: {int(subinfo['score'])}/{int(subinfo['max_score'])}")