diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 0908e9d58e..3c6cd2e0c7 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -85,7 +85,10 @@ jobs:
         NVIDIA_API_KEY: ${{ secrets.NVIDIA_API_KEY }}
         HF_TOKEN: ${{ secrets.HF_TOKEN }}
       run: |
-        docker run --rm --network=host nemo-skills-sandbox-image &
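+        # Shared runtime directory: the IOI evaluator writes files under /nemo_run on the host and the sandbox container reads them through this mount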
+        sudo mkdir -p /nemo_run
+        sudo chmod 777 /nemo_run
+        docker run --rm --network=host -v /nemo_run:/nemo_run nemo-skills-sandbox-image &
         sleep 10
         set -o pipefail # this will make sure next line returns non-0 exit code if tests fail
         ns prepare_data gsm8k math-500
diff --git a/docs/evaluation/code.md b/docs/evaluation/code.md
index 5a8d634bcc..1b4a60a14e 100644
--- a/docs/evaluation/code.md
+++ b/docs/evaluation/code.md
@@ -185,10 +185,10 @@ We currently support IOI24 and are working to support IOI25 for evaluation. The
 
 #### Data Preparation
 
-First, prepare the dataset by running the `ns prepare_data` command. The arguments below will generate `test.jsonl` and `test_metadata.json`.
+First, prepare the dataset by running the `ns prepare_data` command. The arguments below will generate `ioi24.jsonl` and `ioi24_metadata.json`.
 
 ```
-ns prepare_data ioi24
+ns prepare_data ioi
 ```
 
 #### Running the Evaluation
@@ -209,10 +209,11 @@ ns eval \
     --server_gpus=8 \
     --benchmarks=ioi24:50 \
     --with_sandbox \
-    --split=test \
+    --split=ioi24 \
     --data_dir=<DATA_DIR> \
     --output_dir=<OUTPUT_DIR> \
-    --extra_eval_args="++eval_config.test_file=<PATH_TO_METADATA_TEST_FILE>" \
+    --eval_subfolder=eval-results/ioi24/ \
+    --extra_eval_args="++eval_config.test_file=<PATH_TO_METADATA_TEST_DIR>/ioi24_metadata.json" \
     ++inference.temperature=0.6 \
     ++inference.top_p=0.95 \
     ++inference.tokens_to_generate=65536
@@ -220,13 +221,12 @@ ns eval \
 
 ##### Verifying Results
 
-After all jobs are complete, you can check the results in `<OUTPUT_DIR>/eval-results/ioi24/metrics.json`. You can also take a look at `<OUTPUT_DIR>/eval-results/ioi24/summarized-results/main_*`. They should look something like this:
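+After all jobs are complete, you can check the results in `<OUTPUT_DIR>/eval-results/ioi24/ioi/metrics.json`. You can also take a look at `<OUTPUT_DIR>/eval-results/ioi24/ioi/summarized-results/main_*`. The `eval-results/ioi24/` part of these paths is the value passed to `--eval_subfolder`; set it to a different folder if you want to keep the results of different subsets apart. The results should look something like this: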
 
 ```
------------------------------------------------------- ioi24 ------------------------------------------------------
-evaluation_mode   | num_entries | avg_tokens | gen_seconds | correct       | total_score        | round_robin_score
-pass@1[avg-of-50] | 39          | 40387      | 7410        | 0.51% ± 1.04% | 303.47             | 261.01
-pass@50           | 39          | 40387      | 7410        | 2.56%         | 303.47             | 261.01
+------------------------------------ ioi24 -------------------------------------
+evaluation_mode | num_entries | avg_tokens | gen_seconds | correct | total_score
+pass@50         | 39          | 52225      | 99630       | 23.08%  | 500
 ```
 
 ### livecodebench
diff --git a/nemo_skills/dataset/ioi24/__init__.py b/nemo_skills/dataset/ioi/__init__.py
similarity index 100%
rename from nemo_skills/dataset/ioi24/__init__.py
rename to nemo_skills/dataset/ioi/__init__.py
diff --git a/nemo_skills/dataset/ioi24/prepare.py b/nemo_skills/dataset/ioi/prepare.py
similarity index 93%
rename from nemo_skills/dataset/ioi24/prepare.py
rename to nemo_skills/dataset/ioi/prepare.py
index 656e480b60..3849607b0f 100644
--- a/nemo_skills/dataset/ioi24/prepare.py
+++ b/nemo_skills/dataset/ioi/prepare.py
@@ -27,6 +27,7 @@
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--split", type=str, default="test")
+    parser.add_argument("--suffix", type=str, default="24")
     args = parser.parse_args()
 
     data_dir = Path(__file__).absolute().parent
@@ -50,7 +51,7 @@
                 }
             )
 
-    with open(os.path.join(data_dir, f"{args.split}.jsonl"), "w") as f:
+    with open(os.path.join(data_dir, f"ioi{args.suffix}.jsonl"), "w") as f:
         f.write("\n".join(json.dumps(x) for x in entries))
 
     tests_dataset = load_dataset("open-r1/ioi-test-cases", name="2024", split="train")
@@ -82,5 +83,5 @@
             "grader_files": entry["grader_files"],
         }
 
-    with open(os.path.join(data_dir, f"{args.split}_metadata.json"), "w") as f:
+    with open(os.path.join(data_dir, f"ioi{args.suffix}_metadata.json"), "w") as f:
         json.dump(final_structure, f)
diff --git a/nemo_skills/dataset/ioi25/__init__.py b/nemo_skills/dataset/ioi25/__init__.py
deleted file mode 100644
index 3032b16653..0000000000
--- a/nemo_skills/dataset/ioi25/__init__.py
+++ /dev/null
@@ -1,31 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-todo: We are working on providing the data files that are necessary to run IOI25 evaluation.
-"""
-
-# settings that define how evaluation should be done by default (all can be changed from cmdline)
-GENERATION_ARGS = "++prompt_config=generic/default ++eval_type=ioi"
-DATASET_GROUP = "code"
-METRICS_TYPE = "ioi"
-
-# environment variables required by this benchmark
-SANDBOX_ENV_VARS = [
-    "UWSGI_PROCESSES=1024",
-    "UWSGI_CPU_AFFINITY=8",
-    "UWSGI_CHEAPER=1023",
-    "NUM_WORKERS=1",
-    "STATEFUL_SANDBOX=0",
-]
diff --git a/nemo_skills/evaluation/evaluator/ioi.py b/nemo_skills/evaluation/evaluator/ioi.py
index 239a23db6c..9d1738518b 100644
--- a/nemo_skills/evaluation/evaluator/ioi.py
+++ b/nemo_skills/evaluation/evaluator/ioi.py
@@ -12,23 +12,25 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import asyncio
+import hashlib
 import json
 import multiprocessing
 import os
 import re
+import shutil
 import threading
 import time
-from typing import Dict
 
 from nemo_skills.code_execution.sandbox import LocalSandbox
 from nemo_skills.evaluation.evaluator.base import BaseEvaluator, BaseEvaluatorConfig
 from nemo_skills.file_utils import jdump
-from nemo_skills.utils import nested_dataclass
+from nemo_skills.utils import nested_dataclass, unroll_files
 
 
 @nested_dataclass(kw_only=True)
 class IOIEvaluatorConfig(BaseEvaluatorConfig):
     test_file: str = "test_metadata.json"
+    input_file: str | None = None
     num_workers: int = 16  # number of test workers
     test_batch_size: int = 16  # number of tests to run concurrently
     overwrite: bool = False
@@ -40,6 +42,10 @@ class IOIEvaluatorConfig(BaseEvaluatorConfig):
 asyncio.set_event_loop(worker_loop)
 
 
+def sha256_hex(text: str) -> str:
+    return hashlib.sha256(text.encode("utf-8", errors="replace")).hexdigest()
+
+
 def _sandbox_exec_sync(sandbox: LocalSandbox, cmd: str, *, language: str = "shell", timeout: int = 120):
     """Run sandbox.execute_code synchronously with a persistent event loop.
 
@@ -88,29 +94,31 @@ def _precompile_grader(
         wait_for_sandbox(sandbox)
         sandbox._owner_tid = threading.get_ident()
 
-    pre_dir = f"/tmp/ioi_pre_{problem_name}_{os.getpid()}"
-    # Build shell script to create files and invoke compile.sh.
-    creation_cmds = [
-        f"mkdir -p {pre_dir}/graders",
-    ]
-    # Dump grader related files
+    pre_dir = f"/nemo_run/ioi_pre_{problem_name}_{os.getpid()}"
+    # Create directories and files locally; sandbox shares the same filesystem
+    os.makedirs(os.path.join(pre_dir, "graders"), exist_ok=True)
+
+    # Dump grader related files locally
     for filepath, content in grader_files:
-        dir_name = os.path.dirname(filepath)
-        if dir_name:
-            creation_cmds.append(f"mkdir -p {pre_dir}/{dir_name}")
-        creation_cmds.append(f"cat <<'_EOT_' > {pre_dir}/{filepath}\n{content}\n_EOT_\n")
-
-    # Write compile.sh and run.sh as provided (needed later in workers)
-    creation_cmds.append(
-        f"cat <<'_EOT_' > {pre_dir}/compile.sh\n{compile_code}\n_EOT_\nchmod +x {pre_dir}/compile.sh\n"
-    )
-    creation_cmds.append(f"cat <<'_EOT_' > {pre_dir}/run.sh\n{run_code}\n_EOT_\nchmod +x {pre_dir}/run.sh\n")
-
-    setup_script = "\n".join(creation_cmds)
-    # 1. create files
-    _sandbox_exec_sync(sandbox, setup_script, language="shell", timeout=120)
-
-    # 2. run compile.sh but ignore final failure when problem cpp missing
+        target_path = os.path.join(pre_dir, filepath)
+        target_dir = os.path.dirname(target_path)
+        if target_dir:
+            os.makedirs(target_dir, exist_ok=True)
+        with open(target_path, "w", encoding="utf-8") as f:
+            f.write(content)
+
+    # Write compile.sh and run.sh locally and make them executable
+    compile_path = os.path.join(pre_dir, "compile.sh")
+    with open(compile_path, "w", encoding="utf-8") as f:
+        f.write(compile_code)
+    os.chmod(compile_path, 0o755)
+
+    run_path = os.path.join(pre_dir, "run.sh")
+    with open(run_path, "w", encoding="utf-8") as f:
+        f.write(run_code)
+    os.chmod(run_path, 0o755)
+
+    # Run compile.sh inside the sandbox (same filesystem)
     _sandbox_exec_sync(sandbox, f"cd {pre_dir} && ./compile.sh || true", language="shell", timeout=120)
 
     return pre_dir
@@ -118,46 +126,28 @@ def _precompile_grader(
 
 def run_test_case(task_args: dict, worker_id: int) -> dict:
     # Use high-resolution timestamp to guarantee uniqueness across parallel calls.
-    unique_dir = f"/tmp/ioi_run_{worker_id}_{os.getpid()}_{time.time_ns()}"
+    unique_dir = f"/nemo_run/ioi_run_{worker_id}_{os.getpid()}_{time.time_ns()}"
 
     try:
-        # 1. Create all necessary files in one batch command
+        # 1. Create all necessary files locally (sandbox shares filesystem)
         precompiled_dir = task_args.get("precompiled_dir")
-        # Step 1: prepare the working directory and copy shared pre-compiled artifacts first
-        file_creation_commands = [
-            # Create the unique run directory itself
-            f"mkdir -p {unique_dir}",
-            # Ensure `graders/` directory exists
-            f"mkdir -p {unique_dir}/graders",
-            f"cp -r {precompiled_dir}/* {unique_dir}/",
-            # Next write the contestant's generated solution into the graders folder so it is not overwritten
-            f"cat <<'_EOT_' > {unique_dir}/graders/{task_args['problem_id']}.cpp\n{task_args['generated_code']}\n_EOT_\n",
-        ]
-
+        os.makedirs(unique_dir, exist_ok=True)
+        os.makedirs(os.path.join(unique_dir, "graders"), exist_ok=True)
+        # Copy precompiled assets into unique run directory
+        if precompiled_dir and os.path.isdir(precompiled_dir):
+            shutil.copytree(precompiled_dir, unique_dir, dirs_exist_ok=True)
+        # Write contestant solution
+        with open(os.path.join(unique_dir, "graders", f"{task_args['problem_id']}.cpp"), "w", encoding="utf-8") as f:
+            f.write(task_args["generated_code"])
         # Prepare input and expected output files
-        file_creation_commands.append(f"cat <<'_EOT_' > {unique_dir}/input.txt\n{task_args['test_input']}\n_EOT_\n")
-        file_creation_commands.append(
-            f"cat <<'_EOT_' > {unique_dir}/correct_output.txt\n{task_args['test_output']}\n_EOT_\n"
-        )
-
-        setup_script = "\n".join(file_creation_commands)
-        sandbox = LocalSandbox()
-        setup_result, _ = worker_loop.run_until_complete(
-            sandbox.execute_code(setup_script, language="shell", timeout=120)
-        )
-        if setup_result.get("stderr"):
-            raise Exception(f"File setup failed: {setup_result['stderr']}")
+        with open(os.path.join(unique_dir, "input.txt"), "w", encoding="utf-8") as f:
+            f.write(task_args["test_input"])
+        with open(os.path.join(unique_dir, "correct_output.txt"), "w", encoding="utf-8") as f:
+            f.write(task_args["test_output"])
 
         # 2. Compile only the problem solution (skip checker/grader recompilation)
-        # Compile the solution together with optional grader/stub sources without
-        # recompiling the checker/manager again.
-        compile_command = (
-            f"cd {unique_dir} && "
-            f'SRC="graders/{task_args["problem_id"]}.cpp"; '
-            f'[ -e graders/grader.cpp ] && SRC="$SRC graders/grader.cpp"; '
-            f'[ -e graders/stub.cpp ] && SRC="$SRC graders/stub.cpp"; '
-            f"g++ -DEVAL -std=gnu++17 -O2 -pipe -s -o graders/{task_args['problem_id']} $SRC"
-        )
+        compile_command = f"cd {unique_dir} && ./compile.sh"
+        sandbox = LocalSandbox()
         compile_result, _ = worker_loop.run_until_complete(
             sandbox.execute_code(compile_command, language="shell", timeout=120)
         )
@@ -202,11 +192,80 @@ def run_test_case(task_args: dict, worker_id: int) -> dict:
         return {"score": 0.0, "output": "", "error": str(e)}
 
     finally:
-        # 4. Clean up the directory
-        # Fire and forget; ignore return values
+        # 4. Clean up the directory locally
+        try:
+            shutil.rmtree(unique_dir, ignore_errors=True)
+        except Exception:
+            pass
+
+
+def run_input_case(task_args: dict, worker_id: int) -> dict:
+    # Use high-resolution timestamp to guarantee uniqueness across parallel calls.
+    unique_dir = f"/nemo_run/ioi_run_{worker_id}_{os.getpid()}_{time.time_ns()}"
+
+    try:
+        # 1. Create all necessary files locally (sandbox shares filesystem)
+        os.makedirs(unique_dir, exist_ok=True)
+        for filepath, content in task_args.get("run_files", []):
+            target_path = os.path.join(unique_dir, os.path.basename(filepath))
+            with open(target_path, "w", encoding="utf-8") as f:
+                f.write(content)
+        for fname in ("compile", "run"):
+            fpath = os.path.join(unique_dir, fname)
+            if os.path.exists(fpath):
+                os.chmod(fpath, 0o755)
+        # Write contestant solution into problem solution file
+        solution_path = os.path.join(unique_dir, f"{task_args['problem_id']}.cpp")
+        with open(solution_path, "w", encoding="utf-8") as f:
+            f.write(task_args["generated_code"])
+        # Prepare only input file (no ground-truth for input-only runs)
+        with open(os.path.join(unique_dir, "input.txt"), "w", encoding="utf-8") as f:
+            f.write(task_args["test_input"])
+
+        # 2. Compile using run_files toolchain
+        compile_command = f"cd {unique_dir} && ./compile"
+        sandbox = LocalSandbox()
+        compile_result, _ = worker_loop.run_until_complete(
+            sandbox.execute_code(compile_command, language="shell", timeout=120)
+        )
+
+        result = {
+            "compile_success": not compile_result.get("stderr"),
+            "compile_stdout": compile_result.get("stdout", ""),
+            "compile_stderr": compile_result.get("stderr", ""),
+            "run_stdout": "",
+            "run_stderr": "",
+            "error": "",
+        }
+
+        if not result["compile_success"]:
+            return result
+
+        # 3. Run the code using run_files runner
+        run_command = f"cd {unique_dir} && ./run < input.txt"
+        run_result, _ = worker_loop.run_until_complete(
+            sandbox.execute_code(run_command, language="shell", timeout=120, max_output_characters=1000000)
+        )
+
+        run_stdout = sha256_hex(run_result.get("stdout", ""))
+        run_stderr = run_result.get("stderr", "")
+
+        result.update(
+            {
+                "run_stdout": run_stdout,
+                "run_stderr": run_stderr,
+            }
+        )
+
+        return result
+
+    except Exception as e:
+        return {"run_stdout": "", "run_stderr": "", "error": str(e)}
+
+    finally:
+        # 4. Clean up the directory locally
         try:
-            sandbox = LocalSandbox()
-            worker_loop.run_until_complete(sandbox.execute_code(f"rm -rf {unique_dir}", language="shell", timeout=120))
+            shutil.rmtree(unique_dir, ignore_errors=True)
         except Exception:
             pass
 
@@ -250,10 +309,11 @@ def __init__(self, config: dict, num_parallel_requests: int = 10):
         self.eval_cfg = IOIEvaluatorConfig(_init_nested=True, **config)
 
         # Heavy runtime resources are lazily initialized within _evaluate_entry.
-        self.sandbox = None  # type: ignore
-        self.metadata = None  # type: ignore
-        self.precompiled_cache: Dict[str, str] = {}
-        self.pool = None  # type: ignore
+        self.sandbox = None
+        self.metadata = None
+        self.inputdata = None
+        self.precompiled_cache = {}
+        self.pool = None
 
     async def _initialize_runtime(self):
         """Asynchronously create sandbox and related runtime state on first use."""
@@ -275,14 +335,23 @@ def _setup():
                 )
             with open(self.eval_cfg.test_file, "r") as f:
                 metadata_local = json.load(f)
+            input_local = None
+            if self.eval_cfg.input_file:
+                if not os.path.exists(self.eval_cfg.input_file):
+                    raise FileNotFoundError(
+                        f"Input file {self.eval_cfg.input_file} does not exist."
+                        " Please provide a valid parameter for ++eval_config.input_file=x when running IOI Evaluation."
+                    )
+                with open(self.eval_cfg.input_file, "r") as f:
+                    input_local = json.load(f)
             pool_local = multiprocessing.Pool(
                 processes=self.eval_cfg.test_batch_size,
                 initializer=init_worker,
             )
 
-            return sbox, metadata_local, pool_local
+            return sbox, metadata_local, input_local, pool_local
 
-        self.sandbox, self.metadata, self.pool = await asyncio.to_thread(_setup)
+        self.sandbox, self.metadata, self.inputdata, self.pool = await asyncio.to_thread(_setup)
 
     # Internal helper
     async def _evaluate_entry(self, entry: dict) -> dict:
@@ -298,9 +367,10 @@ async def _evaluate_entry(self, entry: dict) -> dict:
         compile_code = subtask_meta["compile"]
         run_code = subtask_meta["run"]
         grader_files = subtask_meta["grader_files"]
+        run_files = subtask_meta.get("run_files", [])
 
         if pid not in self.precompiled_cache:
-            self.precompiled_cache[pid] = await asyncio.to_thread(
+            grader_dir = await asyncio.to_thread(
                 _precompile_grader,
                 pid,
                 grader_files,
@@ -308,7 +378,8 @@ async def _evaluate_entry(self, entry: dict) -> dict:
                 run_code,
                 self.sandbox,
             )
-        pre_dir = self.precompiled_cache[pid]
+            self.precompiled_cache[pid] = {"grader": grader_dir}
+        pre_dir = self.precompiled_cache[pid]["grader"]
 
         subtask_state = {
             st: {
@@ -368,25 +439,53 @@ async def _evaluate_entry(self, entry: dict) -> dict:
             score = round(min(data["scores"]) * data["score"], data["precision"]) if data["scores"] else 0.0
             test_case_results[st] = {"score": score, "outputs": data["outputs"]}
 
+        # Optionally run custom input cases
+        input_outputs = []
+        if self.inputdata is not None:
+            problem_inputs = self.inputdata[str(entry["id"])]
+            for i in range(0, len(problem_inputs), batch_size):
+                batch = problem_inputs[i : i + batch_size]
+                tasks = []
+                for test_data in batch:
+                    tasks.append(
+                        {
+                            "generated_code": completion,
+                            "problem_id": pid,
+                            "run_files": run_files,
+                            "test_input": test_data["content"],
+                        }
+                    )
+                # map with unique worker id argument
+                results = await asyncio.to_thread(
+                    self.pool.starmap, run_input_case, [(ta, idx) for idx, ta in enumerate(tasks)]
+                )
+                for test_data, result in zip(batch, results):
+                    test_name = test_data["file_name"]
+                    test_type = "input"
+                    result["test_name"] = test_name
+                    result["test_type"] = test_type
+                    input_outputs.append(result)
+
         return {
             "name": entry["name"],
             "subtask": entry["subtask"],
             "test_case_results": test_case_results,
+            "input_case_results": input_outputs,
         }
 
-    async def eval_full(self):  # type: ignore[override]
-        jsonl_file = self.eval_cfg.input_file
-        with open(jsonl_file, "r", encoding="utf-8") as f:
-            all_samples = [json.loads(line) for line in f]
+    async def eval_full(self, input_files):  # type: ignore[override]
+        for jsonl_file in unroll_files(input_files):
+            with open(jsonl_file, "r", encoding="utf-8") as f:
+                all_samples = [json.loads(line) for line in f]
 
-        tasks = [self._evaluate_entry(s) for s in all_samples]
-        outputs = await asyncio.gather(*tasks)
+            tasks = [self._evaluate_entry(s) for s in all_samples]
+            outputs = await asyncio.gather(*tasks)
 
-        for s, o in zip(all_samples, outputs):
-            s["test_case_results"] = o["test_case_results"]
-            s["eval_status"] = o["eval_status"]
+            for s, o in zip(all_samples, outputs):
+                s["test_case_results"] = o["test_case_results"]
+                s["input_case_results"] = o["input_case_results"]
 
-        jdump(all_samples, jsonl_file, mode="wt")
+            jdump(all_samples, jsonl_file, mode="wt")
 
         if self.pool is not None:
             self.pool.close()
diff --git a/nemo_skills/evaluation/metrics/ioi_metrics.py b/nemo_skills/evaluation/metrics/ioi_metrics.py
index a2028f6a6d..4f4431a3bd 100644
--- a/nemo_skills/evaluation/metrics/ioi_metrics.py
+++ b/nemo_skills/evaluation/metrics/ioi_metrics.py
@@ -11,15 +11,26 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import json
+import os
+import re
 from collections import defaultdict
 
 from nemo_skills.evaluation.metrics.base import BaseMetrics
 
 
+def extract_final_cpp_block(text):
+    pattern = r"```(?:cpp|Cpp)\s*\n(.*?)```"
+    matches = re.findall(pattern, text, re.DOTALL)
+    return matches[-1] if matches else ""
+
+
 class IOIMetrics(BaseMetrics):
-    def __init__(self):
+    def __init__(self, **kwargs):
         super().__init__()
         self.reset()
+        self.cluster_folder = kwargs.get("cluster_folder", None)
+        print(f"Cluster folder: {self.cluster_folder}")
 
     def update(self, predictions):
         super().update(predictions)
@@ -30,6 +41,54 @@ def update(self, predictions):
     def _get_score_dict(self, p):
         return {"correct": all(r["score"] > 0 for r in p["test_case_results"].values())}
 
+    def extract_info(self, submission) -> dict:
+        # Aggregate IOI per-submission scores for convenience
+        subtask_scores = [v["score"] for _, v in submission["test_case_results"].items()]
+        return {
+            "grade": subtask_scores,
+            "tokens": submission["num_generated_tokens"],
+            "code": extract_final_cpp_block(submission["generation"]),
+        }
+
+    def get_clusters(self, submissions) -> tuple:
+        """Group submissions by the outputs they produced on the custom input cases.
+
+        Returns ``(clusters, last_id)``: clusters maps the tuple of ``run_stdout``
+        values to the member codes plus per-subtask best scores and tie counts;
+        ``last_id`` is the ``id`` of the last submission that carries one (else 0).
+        """
+        clusters = defaultdict(list)
+        last_id = 0  # named ``last_id`` so the builtin ``id`` is not shadowed
+
+        for submission in submissions:
+            input_results = submission.get("input_case_results", [])
+            # Results without ``run_stdout`` do not contribute to the cluster key
+            output_key = tuple(out["run_stdout"] for out in input_results if "run_stdout" in out)
+            subtask_score_list = [res["score"] for res in submission["test_case_results"].values()]
+
+            if output_key not in clusters:
+                # First member: its scores are the per-subtask maxima, each seen once
+                clusters[output_key] = {
+                    "codes": [],
+                    "max_score": subtask_score_list[:],
+                    "max_score_solutions": [1] * len(subtask_score_list),
+                }
+            else:
+                # Update maxima and counts element-wise from this submission
+                max_scores = clusters[output_key]["max_score"]
+                max_counts = clusters[output_key]["max_score_solutions"]
+                for idx, score_val in enumerate(subtask_score_list):
+                    if score_val > max_scores[idx]:
+                        max_scores[idx] = score_val
+                        max_counts[idx] = 1
+                    elif score_val == max_scores[idx]:
+                        max_counts[idx] += 1
+            clusters[output_key]["codes"].append(self.extract_info(submission))
+
+            last_id = submission.get("id", last_id)
+
+        return clusters, last_id
+
     def get_problem_score(self, submissions) -> float:
         """
         For a given problem (list of submissions), compute the score as follows:
@@ -37,7 +96,7 @@ def get_problem_score(self, submissions) -> float:
           - Sum these maximum scores to get the problem score.
         """
         if not submissions:
-            return 0.0
+            return 0.0, {}
         subtask_scores = {}
 
         for submission in submissions:
@@ -45,63 +104,70 @@ def get_problem_score(self, submissions) -> float:
                 subtask_scores[subtask] = max(subtask_scores.get(subtask, 0), result["score"])
         return sum(subtask_scores.values()), subtask_scores
 
-    def simulate_round_robin_score(self, submissions) -> float:
-        """
-        Computes a round robin score for a problem.
-        The procedure is as follows:
-         1. For each submission, compute an aggregate score (sum of subtask scores).
-         2. Sort submissions in descending order by the aggregate score.
-         3. Select up to 50 submissions.
-         4. For each subtask, take the maximum score among the selected submissions.
-         5. Return the sum of these maximum subtask scores.
-        """
-        if not submissions:
-            return 0.0
-
-        # compute an aggregate score per submission
-        for submission in submissions:
-            aggregate_score = sum(result["score"] for result in submission["test_case_results"].values())
-            submission["_aggregate_score"] = aggregate_score
-
-        # sort submissions in descending order by aggregate score
-        sorted_submissions = sorted(submissions, key=lambda s: s["_aggregate_score"], reverse=True)
-        # Select up to 50 submissions.
-        selected = sorted_submissions[:50]
-
-        # for each subtask, take the maximum score among the selected submissions
-        subtask_scores = {}
-        for submission in selected:
-            for subtask, result in submission["test_case_results"].items():
-                subtask_scores[subtask] = max(subtask_scores.get(subtask, 0), result["score"])
-        return sum(subtask_scores.values())
-
     def get_metrics(self):
-        total_score = total_round_robin = 0.0
+        total_score = 0.0
         self.problem_scores = {}
         for name, submissions in self.predictions_by_problem.items():
+            # Cluster the submissions if requested
+            if self.cluster_folder:
+                os.makedirs(self.cluster_folder, exist_ok=True)
+                submissions_by_id = defaultdict(list)
+                for sub in submissions:
+                    submissions_by_id[sub["id"]].append(sub)
+                for sid, sid_submissions in submissions_by_id.items():
+                    clusters, _ = self.get_clusters(sid_submissions)
+                    final_clusters = {}
+                    for i, (output_key, cluster) in enumerate(clusters.items()):
+                        final_clusters[f"cluster_{i + 1}"] = {
+                            "output": output_key,
+                            "codes": cluster["codes"],
+                            "max_score": cluster["max_score"],
+                            "max_score_solutions": cluster["max_score_solutions"],
+                        }
+                    output_file = os.path.join(self.cluster_folder, f"{sid}_cluster.jsonl")
+                    with open(output_file, "w") as f:
+                        json.dump(final_clusters, f, indent=4)
+
             score, subtasks = self.get_problem_score(submissions)
             self.problem_scores[name] = (score, subtasks)
             total_score += score
-            total_round_robin += self.simulate_round_robin_score(submissions)
-        self.print_problem_scores()
+
+        per_problem_subtask_scores = {}
+        for name, (achieved_total, achieved_subtasks) in self.problem_scores.items():
+            submissions = self.predictions_by_problem[name]
+            max_subtasks = {}
+            for sub in submissions:
+                max_subtasks[sub["subtask"]] = sub["subtask_score"]
+            max_total = sum(max_subtasks.values())
+            per_problem_subtask_scores[name] = {
+                "total": {"score": achieved_total, "max_score": max_total},
+                "subtasks": {
+                    subtask: {"score": achieved, "max_score": max_subtasks[subtask]}
+                    for subtask, achieved in achieved_subtasks.items()
+                },
+            }
+
         metrics_dict = super().get_metrics()
         for m in metrics_dict.values():
-            m["total_score"], m["round_robin_score"] = str(total_score), str(total_round_robin)
+            m["total_score"] = int(total_score)
+            m["per_problem_subtask_scores"] = per_problem_subtask_scores
+        self.per_problem_subtask_scores = per_problem_subtask_scores
+        self.print_problem_scores()
         return metrics_dict
 
     def reset(self):
         super().reset()
         self.predictions_by_problem = defaultdict(list)
         self.problem_scores = {}
+        self.per_problem_subtask_scores = {}
+
+    def evaluations_to_print(self):
+        return [f"pass@{self.max_k}"]
 
     def print_problem_scores(self):
         print("---------------------------------Problem and subtask scores---------------------------------")
-        for name, (achieved_total, achieved_subtasks) in self.problem_scores.items():
-            submissions = self.predictions_by_problem[name]
-            max_subtasks = {}
-            for sub in submissions:
-                max_subtasks[sub["subtask"]] = sub["subtask_score"]
-            max_total = sum(max_subtasks.values())
-            print(f"# {name}: {achieved_total}/{max_total}")
-            for subtask, achieved in achieved_subtasks.items():
-                print(f"  {subtask}: {achieved}/{max_subtasks[subtask]}")
+        for name, info in self.per_problem_subtask_scores.items():
+            total = info["total"]
+            print(f"# {name}: {int(total['score'])}/{int(total['max_score'])}")
+            for subtask, subinfo in info["subtasks"].items():
+                print(f"  {subtask}: {int(subinfo['score'])}/{int(subinfo['max_score'])}")