diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 3c6cd2e0c7..0908e9d58e 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -85,10 +85,7 @@ jobs:
         NVIDIA_API_KEY: ${{ secrets.NVIDIA_API_KEY }}
         HF_TOKEN: ${{ secrets.HF_TOKEN }}
       run: |
-        # Default shared runtime directory
-        sudo mkdir -p /nemo_run
-        sudo chmod 777 /nemo_run
-        docker run --rm --network=host -v /nemo_run:/nemo_run nemo-skills-sandbox-image &
+        docker run --rm --network=host nemo-skills-sandbox-image &
         sleep 10
         set -o pipefail # this will make sure next line returns non-0 exit code if tests fail
         ns prepare_data gsm8k math-500
diff --git a/docs/evaluation/code.md b/docs/evaluation/code.md
index 1b4a60a14e..5a8d634bcc 100644
--- a/docs/evaluation/code.md
+++ b/docs/evaluation/code.md
@@ -185,10 +185,10 @@ We currently support IOI24 and are working to support IOI25 for evaluation. The
 
 #### Data Preparation
 
-First, prepare the dataset by running the `ns prepare_data` command. The arguments below will generate `ioi24.jsonl` and `ioi24_metadata.json`.
+First, prepare the dataset by running the `ns prepare_data` command. The command below generates `test.jsonl` and `test_metadata.json`.
 
 ```
-ns prepare_data ioi
+ns prepare_data ioi24
 ```
 
 #### Running the Evaluation
@@ -209,11 +209,10 @@ ns eval \
     --server_gpus=8 \
     --benchmarks=ioi24:50 \
     --with_sandbox \
-    --split=ioi24 \
+    --split=test \
     --data_dir=<DATA_DIR> \
     --output_dir=<OUTPUT_DIR> \
-    --eval_subfolder=eval-results/ioi24/ \ # set the folder if you want to differentiate subsets.
-    --extra_eval_args="++eval_config.test_file=<PATH_TO_METADATA_TEST_DIR>/ioi24_metadata.json" \
+    --extra_eval_args="++eval_config.test_file=<PATH_TO_METADATA_TEST_FILE>" \
     ++inference.temperature=0.6 \
     ++inference.top_p=0.95 \
     ++inference.tokens_to_generate=65536
@@ -221,12 +220,13 @@ ns eval \
 
 ##### Verifying Results
 
-After all jobs are complete, you can check the results in `<OUTPUT_DIR>/eval-results/ioi24/ioi/metrics.json`. You can also take a look at `<OUTPUT_DIR>/eval-results/ioi24/ioi/summarized-results/main_*`. They should look something like this:
+After all jobs are complete, you can check the results in `<OUTPUT_DIR>/eval-results/ioi24/metrics.json`. You can also take a look at `<OUTPUT_DIR>/eval-results/ioi24/summarized-results/main_*`. They should look something like this:
 
 ```
------------------------------------- ioi24 -------------------------------------
-evaluation_mode | num_entries | avg_tokens | gen_seconds | correct | total_score
-pass@50          | 39          | 52225      | 99630       | 23.08%  | 500
+------------------------------------------------------ ioi24 ------------------------------------------------------
+evaluation_mode   | num_entries | avg_tokens | gen_seconds | correct       | total_score        | round_robin_score
+pass@1[avg-of-50] | 39          | 40387      | 7410        | 0.51% ± 1.04% | 303.47             | 261.01
+pass@50           | 39          | 40387      | 7410        | 2.56%         | 303.47             | 261.01
 ```
 
 ### livecodebench
diff --git a/nemo_skills/dataset/ioi/__init__.py b/nemo_skills/dataset/ioi24/__init__.py
similarity index 100%
rename from nemo_skills/dataset/ioi/__init__.py
rename to nemo_skills/dataset/ioi24/__init__.py
diff --git a/nemo_skills/dataset/ioi/prepare.py b/nemo_skills/dataset/ioi24/prepare.py
similarity index 93%
rename from nemo_skills/dataset/ioi/prepare.py
rename to nemo_skills/dataset/ioi24/prepare.py
index 3849607b0f..656e480b60 100644
--- a/nemo_skills/dataset/ioi/prepare.py
+++ b/nemo_skills/dataset/ioi24/prepare.py
@@ -27,7 +27,6 @@
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--split", type=str, default="test")
-    parser.add_argument("--suffix", type=str, default="24")
     args = parser.parse_args()
 
     data_dir = Path(__file__).absolute().parent
@@ -51,7 +50,7 @@
                 }
             )
 
-    with open(os.path.join(data_dir, f"ioi{args.suffix}.jsonl"), "w") as f:
+    with open(os.path.join(data_dir, f"{args.split}.jsonl"), "w") as f:
         f.write("\n".join(json.dumps(x) for x in entries))
 
     tests_dataset = load_dataset("open-r1/ioi-test-cases", name="2024", split="train")
@@ -83,5 +82,5 @@
             "grader_files": entry["grader_files"],
         }
 
-    with open(os.path.join(data_dir, f"ioi{args.suffix}_metadata.json"), "w") as f:
+    with open(os.path.join(data_dir, f"{args.split}_metadata.json"), "w") as f:
         json.dump(final_structure, f)
diff --git a/nemo_skills/dataset/ioi25/__init__.py b/nemo_skills/dataset/ioi25/__init__.py
new file mode 100644
index 0000000000..3032b16653
--- /dev/null
+++ b/nemo_skills/dataset/ioi25/__init__.py
@@ -0,0 +1,31 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Default settings for the IOI25 benchmark. TODO: the data files necessary to run IOI25 evaluation are still being prepared.
+"""
+
+# settings that define how evaluation should be done by default (all can be changed from cmdline)
+GENERATION_ARGS = "++prompt_config=generic/default ++eval_type=ioi"
+DATASET_GROUP = "code"
+METRICS_TYPE = "ioi"
+
+# environment variables required by this benchmark, each written as a "NAME=VALUE" string
+SANDBOX_ENV_VARS = [
+    "UWSGI_PROCESSES=1024",
+    "UWSGI_CPU_AFFINITY=8",
+    "UWSGI_CHEAPER=1023",
+    "NUM_WORKERS=1",
+    "STATEFUL_SANDBOX=0",
+]
diff --git a/nemo_skills/evaluation/evaluator/ioi.py b/nemo_skills/evaluation/evaluator/ioi.py
index 9d1738518b..239a23db6c 100644
--- a/nemo_skills/evaluation/evaluator/ioi.py
+++ b/nemo_skills/evaluation/evaluator/ioi.py
@@ -12,25 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import asyncio
-import hashlib
 import json
 import multiprocessing
 import os
 import re
-import shutil
 import threading
 import time
+from typing import Dict
 
 from nemo_skills.code_execution.sandbox import LocalSandbox
 from nemo_skills.evaluation.evaluator.base import BaseEvaluator, BaseEvaluatorConfig
 from nemo_skills.file_utils import jdump
-from nemo_skills.utils import nested_dataclass, unroll_files
+from nemo_skills.utils import nested_dataclass
 
 
 @nested_dataclass(kw_only=True)
 class IOIEvaluatorConfig(BaseEvaluatorConfig):
     test_file: str = "test_metadata.json"
-    input_file: str | None = None
     num_workers: int = 16  # number of test workers
     test_batch_size: int = 16  # number of tests to run concurrently
     overwrite: bool = False
@@ -42,10 +40,6 @@ class IOIEvaluatorConfig(BaseEvaluatorConfig):
 asyncio.set_event_loop(worker_loop)
 
 
-def sha256_hex(text: str) -> str:
-    return hashlib.sha256(text.encode("utf-8", errors="replace")).hexdigest()
-
-
 def _sandbox_exec_sync(sandbox: LocalSandbox, cmd: str, *, language: str = "shell", timeout: int = 120):
     """Run sandbox.execute_code synchronously with a persistent event loop.
 
@@ -94,31 +88,29 @@ def _precompile_grader(
         wait_for_sandbox(sandbox)
         sandbox._owner_tid = threading.get_ident()
 
-    pre_dir = f"/nemo_run/ioi_pre_{problem_name}_{os.getpid()}"
-    # Create directories and files locally; sandbox shares the same filesystem
-    os.makedirs(os.path.join(pre_dir, "graders"), exist_ok=True)
-
-    # Dump grader related files locally
+    pre_dir = f"/tmp/ioi_pre_{problem_name}_{os.getpid()}"
+    # Build shell script to create files and invoke compile.sh.
+    creation_cmds = [
+        f"mkdir -p {pre_dir}/graders",
+    ]
+    # Dump grader related files
     for filepath, content in grader_files:
-        target_path = os.path.join(pre_dir, filepath)
-        target_dir = os.path.dirname(target_path)
-        if target_dir:
-            os.makedirs(target_dir, exist_ok=True)
-        with open(target_path, "w", encoding="utf-8") as f:
-            f.write(content)
-
-    # Write compile.sh and run.sh locally and make them executable
-    compile_path = os.path.join(pre_dir, "compile.sh")
-    with open(compile_path, "w", encoding="utf-8") as f:
-        f.write(compile_code)
-    os.chmod(compile_path, 0o755)
-
-    run_path = os.path.join(pre_dir, "run.sh")
-    with open(run_path, "w", encoding="utf-8") as f:
-        f.write(run_code)
-    os.chmod(run_path, 0o755)
-
-    # Run compile.sh inside the sandbox (same filesystem)
+        dir_name = os.path.dirname(filepath)
+        if dir_name:
+            creation_cmds.append(f"mkdir -p {pre_dir}/{dir_name}")
+        creation_cmds.append(f"cat <<'_EOT_' > {pre_dir}/{filepath}\n{content}\n_EOT_\n")
+
+    # Write compile.sh and run.sh as provided (needed later in workers)
+    creation_cmds.append(
+        f"cat <<'_EOT_' > {pre_dir}/compile.sh\n{compile_code}\n_EOT_\nchmod +x {pre_dir}/compile.sh\n"
+    )
+    creation_cmds.append(f"cat <<'_EOT_' > {pre_dir}/run.sh\n{run_code}\n_EOT_\nchmod +x {pre_dir}/run.sh\n")
+
+    setup_script = "\n".join(creation_cmds)
+    # 1. create files
+    _sandbox_exec_sync(sandbox, setup_script, language="shell", timeout=120)
+
+    # 2. run compile.sh but ignore final failure when problem cpp missing
     _sandbox_exec_sync(sandbox, f"cd {pre_dir} && ./compile.sh || true", language="shell", timeout=120)
 
     return pre_dir
@@ -126,28 +118,46 @@ def _precompile_grader(
 
 def run_test_case(task_args: dict, worker_id: int) -> dict:
     # Use high-resolution timestamp to guarantee uniqueness across parallel calls.
-    unique_dir = f"/nemo_run/ioi_run_{worker_id}_{os.getpid()}_{time.time_ns()}"
+    unique_dir = f"/tmp/ioi_run_{worker_id}_{os.getpid()}_{time.time_ns()}"
 
     try:
-        # 1. Create all necessary files locally (sandbox shares filesystem)
+        # 1. Create all necessary files in one batch command
         precompiled_dir = task_args.get("precompiled_dir")
-        os.makedirs(unique_dir, exist_ok=True)
-        os.makedirs(os.path.join(unique_dir, "graders"), exist_ok=True)
-        # Copy precompiled assets into unique run directory
-        if precompiled_dir and os.path.isdir(precompiled_dir):
-            shutil.copytree(precompiled_dir, unique_dir, dirs_exist_ok=True)
-        # Write contestant solution
-        with open(os.path.join(unique_dir, "graders", f"{task_args['problem_id']}.cpp"), "w", encoding="utf-8") as f:
-            f.write(task_args["generated_code"])
+        # Step 1: prepare the working directory and copy shared pre-compiled artifacts first
+        file_creation_commands = [
+            # Create the unique run directory itself
+            f"mkdir -p {unique_dir}",
+            # Ensure `graders/` directory exists
+            f"mkdir -p {unique_dir}/graders",
+            f"cp -r {precompiled_dir}/* {unique_dir}/",
+            # Next write the contestant's generated solution into the graders folder so it is not overwritten
+            f"cat <<'_EOT_' > {unique_dir}/graders/{task_args['problem_id']}.cpp\n{task_args['generated_code']}\n_EOT_\n",
+        ]
+
         # Prepare input and expected output files
-        with open(os.path.join(unique_dir, "input.txt"), "w", encoding="utf-8") as f:
-            f.write(task_args["test_input"])
-        with open(os.path.join(unique_dir, "correct_output.txt"), "w", encoding="utf-8") as f:
-            f.write(task_args["test_output"])
+        file_creation_commands.append(f"cat <<'_EOT_' > {unique_dir}/input.txt\n{task_args['test_input']}\n_EOT_\n")
+        file_creation_commands.append(
+            f"cat <<'_EOT_' > {unique_dir}/correct_output.txt\n{task_args['test_output']}\n_EOT_\n"
+        )
 
-        # 2. Compile only the problem solution (skip checker/grader recompilation)
-        compile_command = f"cd {unique_dir} && ./compile.sh"
+        setup_script = "\n".join(file_creation_commands)
         sandbox = LocalSandbox()
+        setup_result, _ = worker_loop.run_until_complete(
+            sandbox.execute_code(setup_script, language="shell", timeout=120)
+        )
+        if setup_result.get("stderr"):
+            raise Exception(f"File setup failed: {setup_result['stderr']}")
+
+        # 2. Compile only the problem solution (skip checker/grader recompilation)
+        # Compile the solution together with optional grader/stub sources without
+        # recompiling the checker/manager again.
+        compile_command = (
+            f"cd {unique_dir} && "
+            f'SRC="graders/{task_args["problem_id"]}.cpp"; '
+            f'[ -e graders/grader.cpp ] && SRC="$SRC graders/grader.cpp"; '
+            f'[ -e graders/stub.cpp ] && SRC="$SRC graders/stub.cpp"; '
+            f"g++ -DEVAL -std=gnu++17 -O2 -pipe -s -o graders/{task_args['problem_id']} $SRC"
+        )
         compile_result, _ = worker_loop.run_until_complete(
             sandbox.execute_code(compile_command, language="shell", timeout=120)
         )
@@ -192,80 +202,11 @@ def run_test_case(task_args: dict, worker_id: int) -> dict:
         return {"score": 0.0, "output": "", "error": str(e)}
 
     finally:
-        # 4. Clean up the directory locally
-        try:
-            shutil.rmtree(unique_dir, ignore_errors=True)
-        except Exception:
-            pass
-
-
-def run_input_case(task_args: dict, worker_id: int) -> dict:
-    # Use high-resolution timestamp to guarantee uniqueness across parallel calls.
-    unique_dir = f"/nemo_run/ioi_run_{worker_id}_{os.getpid()}_{time.time_ns()}"
-
-    try:
-        # 1. Create all necessary files locally (sandbox shares filesystem)
-        os.makedirs(unique_dir, exist_ok=True)
-        for filepath, content in task_args.get("run_files", []):
-            target_path = os.path.join(unique_dir, os.path.basename(filepath))
-            with open(target_path, "w", encoding="utf-8") as f:
-                f.write(content)
-        for fname in ("compile", "run"):
-            fpath = os.path.join(unique_dir, fname)
-            if os.path.exists(fpath):
-                os.chmod(fpath, 0o755)
-        # Write contestant solution into problem solution file
-        solution_path = os.path.join(unique_dir, f"{task_args['problem_id']}.cpp")
-        with open(solution_path, "w", encoding="utf-8") as f:
-            f.write(task_args["generated_code"])
-        # Prepare only input file (no ground-truth for input-only runs)
-        with open(os.path.join(unique_dir, "input.txt"), "w", encoding="utf-8") as f:
-            f.write(task_args["test_input"])
-
-        # 2. Compile using run_files toolchain
-        compile_command = f"cd {unique_dir} && ./compile"
-        sandbox = LocalSandbox()
-        compile_result, _ = worker_loop.run_until_complete(
-            sandbox.execute_code(compile_command, language="shell", timeout=120)
-        )
-
-        result = {
-            "compile_success": not compile_result.get("stderr"),
-            "compile_stdout": compile_result.get("stdout", ""),
-            "compile_stderr": compile_result.get("stderr", ""),
-            "run_stdout": "",
-            "run_stderr": "",
-            "error": "",
-        }
-
-        if not result["compile_success"]:
-            return result
-
-        # 3. Run the code using run_files runner
-        run_command = f"cd {unique_dir} && ./run < input.txt"
-        run_result, _ = worker_loop.run_until_complete(
-            sandbox.execute_code(run_command, language="shell", timeout=120, max_output_characters=1000000)
-        )
-
-        run_stdout = sha256_hex(run_result.get("stdout", ""))
-        run_stderr = run_result.get("stderr", "")
-
-        result.update(
-            {
-                "run_stdout": run_stdout,
-                "run_stderr": run_stderr,
-            }
-        )
-
-        return result
-
-    except Exception as e:
-        return {"run_stdout": "", "run_stderr": "", "error": str(e)}
-
-    finally:
-        # 4. Clean up the directory locally
+        # 4. Clean up the directory
+        # Fire and forget; ignore return values
         try:
-            shutil.rmtree(unique_dir, ignore_errors=True)
+            sandbox = LocalSandbox()
+            worker_loop.run_until_complete(sandbox.execute_code(f"rm -rf {unique_dir}", language="shell", timeout=120))
         except Exception:
             pass
 
@@ -309,11 +250,10 @@ def __init__(self, config: dict, num_parallel_requests: int = 10):
         self.eval_cfg = IOIEvaluatorConfig(_init_nested=True, **config)
 
         # Heavy runtime resources are lazily initialized within _evaluate_entry.
-        self.sandbox = None
-        self.metadata = None
-        self.inputdata = None
-        self.precompiled_cache = {}
-        self.pool = None
+        self.sandbox = None  # type: ignore
+        self.metadata = None  # type: ignore
+        self.precompiled_cache: Dict[str, str] = {}
+        self.pool = None  # type: ignore
 
     async def _initialize_runtime(self):
         """Asynchronously create sandbox and related runtime state on first use."""
@@ -335,23 +275,14 @@ def _setup():
                 )
             with open(self.eval_cfg.test_file, "r") as f:
                 metadata_local = json.load(f)
-            input_local = None
-            if self.eval_cfg.input_file:
-                if not os.path.exists(self.eval_cfg.input_file):
-                    raise FileNotFoundError(
-                        f"Input file {self.eval_cfg.input_file} does not exist."
-                        " Please provide a valid parameter for ++eval_config.input_file=x when running IOI Evaluation."
-                    )
-                with open(self.eval_cfg.input_file, "r") as f:
-                    input_local = json.load(f)
             pool_local = multiprocessing.Pool(
                 processes=self.eval_cfg.test_batch_size,
                 initializer=init_worker,
             )
 
-            return sbox, metadata_local, input_local, pool_local
+            return sbox, metadata_local, pool_local
 
-        self.sandbox, self.metadata, self.inputdata, self.pool = await asyncio.to_thread(_setup)
+        self.sandbox, self.metadata, self.pool = await asyncio.to_thread(_setup)
 
     # Internal helper
     async def _evaluate_entry(self, entry: dict) -> dict:
@@ -367,10 +298,9 @@ async def _evaluate_entry(self, entry: dict) -> dict:
         compile_code = subtask_meta["compile"]
         run_code = subtask_meta["run"]
         grader_files = subtask_meta["grader_files"]
-        run_files = subtask_meta.get("run_files", [])
 
         if pid not in self.precompiled_cache:
-            grader_dir = await asyncio.to_thread(
+            self.precompiled_cache[pid] = await asyncio.to_thread(
                 _precompile_grader,
                 pid,
                 grader_files,
@@ -378,8 +308,7 @@ async def _evaluate_entry(self, entry: dict) -> dict:
                 run_code,
                 self.sandbox,
             )
-            self.precompiled_cache[pid] = {"grader": grader_dir}
-        pre_dir = self.precompiled_cache[pid]["grader"]
+        pre_dir = self.precompiled_cache[pid]
 
         subtask_state = {
             st: {
@@ -439,53 +368,25 @@ async def _evaluate_entry(self, entry: dict) -> dict:
             score = round(min(data["scores"]) * data["score"], data["precision"]) if data["scores"] else 0.0
             test_case_results[st] = {"score": score, "outputs": data["outputs"]}
 
-        # Optionally run custom input cases
-        input_outputs = []
-        if self.inputdata is not None:
-            problem_inputs = self.inputdata[str(entry["id"])]
-            for i in range(0, len(problem_inputs), batch_size):
-                batch = problem_inputs[i : i + batch_size]
-                tasks = []
-                for test_data in batch:
-                    tasks.append(
-                        {
-                            "generated_code": completion,
-                            "problem_id": pid,
-                            "run_files": run_files,
-                            "test_input": test_data["content"],
-                        }
-                    )
-                # map with unique worker id argument
-                results = await asyncio.to_thread(
-                    self.pool.starmap, run_input_case, [(ta, idx) for idx, ta in enumerate(tasks)]
-                )
-                for test_data, result in zip(batch, results):
-                    test_name = test_data["file_name"]
-                    test_type = "input"
-                    result["test_name"] = test_name
-                    result["test_type"] = test_type
-                    input_outputs.append(result)
-
         return {
             "name": entry["name"],
             "subtask": entry["subtask"],
             "test_case_results": test_case_results,
-            "input_case_results": input_outputs,
         }
 
-    async def eval_full(self, input_files):  # type: ignore[override]
-        for jsonl_file in unroll_files(input_files):
-            with open(jsonl_file, "r", encoding="utf-8") as f:
-                all_samples = [json.loads(line) for line in f]
+    async def eval_full(self):  # type: ignore[override]
+        jsonl_file = self.eval_cfg.input_file
+        with open(jsonl_file, "r", encoding="utf-8") as f:
+            all_samples = [json.loads(line) for line in f]
 
-            tasks = [self._evaluate_entry(s) for s in all_samples]
-            outputs = await asyncio.gather(*tasks)
+        tasks = [self._evaluate_entry(s) for s in all_samples]
+        outputs = await asyncio.gather(*tasks)
 
-            for s, o in zip(all_samples, outputs):
-                s["test_case_results"] = o["test_case_results"]
-                s["input_case_results"] = o["input_case_results"]
+        for s, o in zip(all_samples, outputs):
+            s["test_case_results"] = o["test_case_results"]
+            s["eval_status"] = o.get("eval_status")  # _evaluate_entry's result dict has no such key; avoid KeyError
 
-            jdump(all_samples, jsonl_file, mode="wt")
+        jdump(all_samples, jsonl_file, mode="wt")
 
         if self.pool is not None:
             self.pool.close()
diff --git a/nemo_skills/evaluation/metrics/ioi_metrics.py b/nemo_skills/evaluation/metrics/ioi_metrics.py
index 4f4431a3bd..a2028f6a6d 100644
--- a/nemo_skills/evaluation/metrics/ioi_metrics.py
+++ b/nemo_skills/evaluation/metrics/ioi_metrics.py
@@ -11,26 +11,15 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import json
-import os
-import re
 from collections import defaultdict
 
 from nemo_skills.evaluation.metrics.base import BaseMetrics
 
 
-def extract_final_cpp_block(text):
-    pattern = r"```(?:cpp|Cpp)\s*\n(.*?)```"
-    matches = re.findall(pattern, text, re.DOTALL)
-    return matches[-1] if matches else ""
-
-
 class IOIMetrics(BaseMetrics):
-    def __init__(self, **kwargs):
+    def __init__(self):
         super().__init__()
         self.reset()
-        self.cluster_folder = kwargs.get("cluster_folder", None)
-        print(f"Cluster folder: {self.cluster_folder}")
 
     def update(self, predictions):
         super().update(predictions)
@@ -41,54 +30,6 @@ def update(self, predictions):
     def _get_score_dict(self, p):
         return {"correct": all(r["score"] > 0 for r in p["test_case_results"].values())}
 
-    def extract_info(self, submission) -> dict:
-        # Aggregate IOI per-submission scores for convenience
-        subtask_scores = [v["score"] for _, v in submission["test_case_results"].items()]
-        return {
-            "grade": subtask_scores,
-            "tokens": submission["num_generated_tokens"],
-            "code": extract_final_cpp_block(submission["generation"]),
-        }
-
-    def get_clusters(self, submissions) -> dict:
-        clusters = defaultdict(list)
-        id = 0
-
-        for submission in submissions:
-            input_results = submission.get("input_case_results", [])
-            run_outputs = []
-            for output in input_results:
-                if "run_stdout" not in output:
-                    continue
-                run_outputs.append(output["run_stdout"])
-            output_key = tuple(run_outputs)
-
-            extract_info = self.extract_info(submission)
-            if output_key not in clusters:
-                # Initialize per-subtask maxima and counts with this submission's scores
-                subtask_score_list = [res["score"] for _, res in submission["test_case_results"].items()]
-                clusters[output_key] = {
-                    "codes": [],
-                    "max_score": subtask_score_list[:],
-                    "max_score_solutions": [1] * len(subtask_score_list),
-                }
-            else:
-                # Update maxima and counts element-wise from this submission
-                subtask_score_list = [res["score"] for _, res in submission["test_case_results"].items()]
-                max_scores = clusters[output_key]["max_score"]
-                max_counts = clusters[output_key]["max_score_solutions"]
-                for idx, score_val in enumerate(subtask_score_list):
-                    if score_val > max_scores[idx]:
-                        max_scores[idx] = score_val
-                        max_counts[idx] = 1
-                    elif score_val == max_scores[idx]:
-                        max_counts[idx] += 1
-            clusters[output_key]["codes"].append(extract_info)
-
-            id = submission.get("id", id)
-
-        return clusters, id
-
     def get_problem_score(self, submissions) -> float:
         """
         For a given problem (list of submissions), compute the score as follows:
@@ -96,7 +37,7 @@ def get_problem_score(self, submissions) -> float:
           - Sum these maximum scores to get the problem score.
         """
         if not submissions:
-            return 0.0, {}
+            return 0.0, {}  # same (score, subtask_scores) tuple shape as the normal return, which get_metrics unpacks
         subtask_scores = {}
 
         for submission in submissions:
@@ -104,70 +45,63 @@ def get_problem_score(self, submissions) -> float:
                 subtask_scores[subtask] = max(subtask_scores.get(subtask, 0), result["score"])
         return sum(subtask_scores.values()), subtask_scores
 
+    def simulate_round_robin_score(self, submissions) -> float:
+        """
+        Computes a round robin score for a problem without modifying `submissions`.
+        The procedure is as follows:
+         1. For each submission, compute an aggregate score (sum of subtask scores).
+         2. Sort submissions in descending order by the aggregate score.
+         3. Select up to 50 submissions.
+         4. For each subtask, take the maximum score among the selected submissions.
+         5. Return the sum of these maximum subtask scores (0.0 when there are no submissions).
+        """
+        if not submissions:
+            return 0.0
+
+        def aggregate_score(submission):
+            # computed on demand so no helper key is written into the caller's dicts
+            return sum(result["score"] for result in submission["test_case_results"].values())
+
+        # sort submissions in descending order by aggregate score; sorted() is stable,
+        # so tied submissions keep their original relative order
+        sorted_submissions = sorted(submissions, key=aggregate_score, reverse=True)
+        # Select up to 50 submissions.
+        selected = sorted_submissions[:50]
+
+        # for each subtask, take the maximum score among the selected submissions
+        subtask_scores = {}
+        for submission in selected:
+            for subtask, result in submission["test_case_results"].items():
+                subtask_scores[subtask] = max(subtask_scores.get(subtask, 0), result["score"])
+        return sum(subtask_scores.values())
+
     def get_metrics(self):
-        total_score = 0.0
+        total_score = total_round_robin = 0.0
         self.problem_scores = {}
         for name, submissions in self.predictions_by_problem.items():
-            # Cluster the submissions if requested
-            if self.cluster_folder:
-                os.makedirs(self.cluster_folder, exist_ok=True)
-                submissions_by_id = defaultdict(list)
-                for sub in submissions:
-                    submissions_by_id[sub["id"]].append(sub)
-                for sid, sid_submissions in submissions_by_id.items():
-                    clusters, _ = self.get_clusters(sid_submissions)
-                    final_clusters = {}
-                    for i, (output_key, cluster) in enumerate(clusters.items()):
-                        final_clusters[f"cluster_{i + 1}"] = {
-                            "output": output_key,
-                            "codes": cluster["codes"],
-                            "max_score": cluster["max_score"],
-                            "max_score_solutions": cluster["max_score_solutions"],
-                        }
-                    output_file = os.path.join(self.cluster_folder, f"{sid}_cluster.jsonl")
-                    with open(output_file, "w") as f:
-                        json.dump(final_clusters, f, indent=4)
-
             score, subtasks = self.get_problem_score(submissions)
             self.problem_scores[name] = (score, subtasks)
             total_score += score
-
-        per_problem_subtask_scores = {}
-        for name, (achieved_total, achieved_subtasks) in self.problem_scores.items():
-            submissions = self.predictions_by_problem[name]
-            max_subtasks = {}
-            for sub in submissions:
-                max_subtasks[sub["subtask"]] = sub["subtask_score"]
-            max_total = sum(max_subtasks.values())
-            per_problem_subtask_scores[name] = {
-                "total": {"score": achieved_total, "max_score": max_total},
-                "subtasks": {
-                    subtask: {"score": achieved, "max_score": max_subtasks[subtask]}
-                    for subtask, achieved in achieved_subtasks.items()
-                },
-            }
-
+            total_round_robin += self.simulate_round_robin_score(submissions)
+        self.print_problem_scores()
         metrics_dict = super().get_metrics()
         for m in metrics_dict.values():
-            m["total_score"] = int(total_score)
-            m["per_problem_subtask_scores"] = per_problem_subtask_scores
-        self.per_problem_subtask_scores = per_problem_subtask_scores
-        self.print_problem_scores()
+            m["total_score"], m["round_robin_score"] = str(total_score), str(total_round_robin)
         return metrics_dict
 
     def reset(self):
         super().reset()
         self.predictions_by_problem = defaultdict(list)
         self.problem_scores = {}
-        self.per_problem_subtask_scores = {}
-
-    def evaluations_to_print(self):
-        return [f"pass@{self.max_k}"]
 
     def print_problem_scores(self):
         print("---------------------------------Problem and subtask scores---------------------------------")
-        for name, info in self.per_problem_subtask_scores.items():
-            total = info["total"]
-            print(f"# {name}: {int(total['score'])}/{int(total['max_score'])}")
-            for subtask, subinfo in info["subtasks"].items():
-                print(f"  {subtask}: {int(subinfo['score'])}/{int(subinfo['max_score'])}")
+        for name, (achieved_total, achieved_subtasks) in self.problem_scores.items():
+            submissions = self.predictions_by_problem[name]
+            max_subtasks = {}
+            for sub in submissions:
+                max_subtasks[sub["subtask"]] = sub["subtask_score"]
+            max_total = sum(max_subtasks.values())
+            print(f"# {name}: {achieved_total}/{max_total}")
+            for subtask, achieved in achieved_subtasks.items():
+                print(f"  {subtask}: {achieved}/{max_subtasks[subtask]}")