Merged (60 commits)
1d09bbf  init commit for adding data (wasiahmad, Sep 25, 2025)
dc8a6eb  init commit for adding data (wasiahmad, Sep 25, 2025)
c4d8084  init commit for adding data (wasiahmad, Sep 25, 2025)
7f03e2a  init commit for adding data (wasiahmad, Sep 25, 2025)
4349f8d  init commit for adding data (wasiahmad, Sep 25, 2025)
5ed5198  init commit for adding data (wasiahmad, Sep 25, 2025)
06e114f  updating docker file (wasiahmad, Sep 25, 2025)
26efae6  updating docker file (wasiahmad, Sep 25, 2025)
2a94a89  updating docker file (wasiahmad, Sep 25, 2025)
45a5faa  dataset prep update (wasiahmad, Sep 25, 2025)
a3c890d  evaluation logic implemented (wasiahmad, Sep 25, 2025)
2dd18f9  fixing lcb-pro eval args (wasiahmad, Sep 25, 2025)
e2ea44b  ojbench eval updated (wasiahmad, Sep 25, 2025)
36cd285  fixing syntax Error (wasiahmad, Sep 25, 2025)
fa86fb8  modified prepare.py (wasiahmad, Sep 25, 2025)
30862b8  removed comments (wasiahmad, Sep 25, 2025)
514b457  fixing RuntimeWarning: coroutine 'eval_ojbench_async.<locals>.install… (wasiahmad, Sep 26, 2025)
21de2c1  fixing minor bug (wasiahmad, Sep 26, 2025)
22cb58f  fixing git url for pip install (wasiahmad, Sep 26, 2025)
b009ed5  fixing pip install issue (wasiahmad, Sep 26, 2025)
9a35a9e  update dmoj version in sandbox (wasiahmad, Sep 26, 2025)
b2912ca  updating num_workers (wasiahmad, Sep 26, 2025)
8c59638  Merge remote-tracking branch 'origin/main' into feat/ojbench (wasiahmad, Sep 26, 2025)
2abc050  adding subset_for_metrics for ojbench (wasiahmad, Sep 27, 2025)
42e13f2  minor fixes (wasiahmad, Sep 27, 2025)
096e325  moving ojbench to a separate script (wasiahmad, Sep 27, 2025)
fa606cc  Merge branch 'main' into feat/ojbench (wasiahmad, Sep 29, 2025)
38cf70d  fix typo (wasiahmad, Sep 27, 2025)
3d40dd6  fix typo (wasiahmad, Sep 27, 2025)
4705a38  use repr to pass filepath (wasiahmad, Sep 27, 2025)
5d87a9f  use repr to pass filepath (wasiahmad, Sep 27, 2025)
ee01d3a  debugging (wasiahmad, Sep 27, 2025)
04917a8  container fails to get paths (wasiahmad, Sep 27, 2025)
ee89703  logging filepaths (wasiahmad, Sep 27, 2025)
58d4170  support to add mount paths for sandbox (wasiahmad, Sep 29, 2025)
f1ef980  support to add mount paths for sandbox (wasiahmad, Sep 29, 2025)
c1391bc  adding keep_mounts_for_sandbox for dataset init file (wasiahmad, Sep 29, 2025)
df9f475  fix: Prevent permission error in OJBench judger (wasiahmad, Sep 29, 2025)
24b413f  fix: Prevent permission error in OJBench judger (wasiahmad, Sep 29, 2025)
bbc5eed  fix: Prevent permission error in OJBench judger (wasiahmad, Sep 29, 2025)
79ea7fb  fix: Prevent permission error in OJBench judger (wasiahmad, Sep 30, 2025)
789ba61  final fixes (wasiahmad, Sep 30, 2025)
a261851  init file is removed debugging (wasiahmad, Sep 30, 2025)
f647002  init file is removed fixed (wasiahmad, Sep 30, 2025)
7b07d0f  Merge remote-tracking branch 'origin/main' into feat/ojbench (wasiahmad, Sep 30, 2025)
ec7ca31  splitting data into python/c++ (wasiahmad, Sep 30, 2025)
e19d7bf  splitting data into python/c++ (wasiahmad, Sep 30, 2025)
de322f6  splitting data into python/c++ (wasiahmad, Sep 30, 2025)
807503e  subset_for_metrics calculation is roll backed (wasiahmad, Sep 30, 2025)
9741f2d  splitting data into python/c++ (wasiahmad, Sep 30, 2025)
1f492d8  added to docs (wasiahmad, Sep 30, 2025)
6e4fc58  replacing .jsonl with .json (wasiahmad, Sep 30, 2025)
d527518  Merge branch 'main' into feat/ojbench (wasiahmad, Sep 30, 2025)
730efbf  updating eval docs (wasiahmad, Sep 30, 2025)
a2b86b6  addressing comments raised in PR (wasiahmad, Sep 30, 2025)
bff6d70  addressing comments raised in PR (wasiahmad, Sep 30, 2025)
3d310d2  addressing comments raised in PR (wasiahmad, Sep 30, 2025)
58037b5  addressed all comments (wasiahmad, Oct 1, 2025)
7709ed4  Merge branch 'main' into feat/ojbench (wasiahmad, Oct 1, 2025)
cc803a9  fixing a minor error (wasiahmad, Oct 1, 2025)
11 changes: 8 additions & 3 deletions dockerfiles/Dockerfile.sandbox
@@ -15,14 +15,19 @@
# Use the base image with Python 3.10 and Flask
FROM tiangolo/uwsgi-nginx-flask:python3.10

# Install dependencies required for Lean 4 and other tools
# Install dependencies required for Lean 4, pypy3, and other tools
RUN apt-get update && \
apt-get install -y curl git net-tools bzip2 && \
apt-get install -y curl git net-tools bzip2 build-essential libseccomp-dev && \
curl -L https://downloads.python.org/pypy/pypy3.10-v7.3.17-linux64.tar.bz2 -o /tmp/pypy.tar.bz2 && \
tar -xjf /tmp/pypy.tar.bz2 -C /opt/ && \
ln -s /opt/pypy3.10-v7.3.17-linux64/bin/pypy3 /usr/local/bin/pypy3 && \
ln -s /opt/pypy3.10-v7.3.17-linux64/bin/pypy3 /usr/bin/pypy3 && \
/usr/bin/pypy3 -m ensurepip && \
rm /tmp/pypy.tar.bz2

# Install the DMOJ judge-server using pip (OJBench eval requirement)
RUN pip install git+https://github.com/DMOJ/judge-server.git@11bf2cd03df83f0df5970a08e98b4cec2dfaecd5

# Install Lean 4 toolchain
RUN curl https://raw.githubusercontent.com/leanprover/elan/master/elan-init.sh -sSf | sh -s -- -y && \
/root/.elan/bin/elan toolchain install leanprover/lean4:v4.12.0 && \
/root/.elan/bin/elan default leanprover/lean4:v4.12.0 && \
64 changes: 63 additions & 1 deletion docs/evaluation/code.md
@@ -201,4 +201,66 @@ all you need to do is replace `openhands` with `swe_agent` in the command above.
### livebench-coding

- Benchmark is defined in [`nemo_skills/dataset/livebench-coding/__init__.py`](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/dataset/livebench-coding/__init__.py)
- Original benchmark source is [here](https://huggingface.co/datasets/livebench/coding).
- Original benchmark source is [here](https://huggingface.co/datasets/livebench/coding).

### OJBench

- Benchmark is defined in [`nemo_skills/dataset/ojbench/__init__.py`](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/dataset/ojbench/__init__.py)
- Original benchmark source is [here](https://github.com/He-Ren/OJBench/tree/main).

#### Data preparation

Before running `ns eval`, you will need to prepare the data with this command:

```
ns prepare_data --data_dir=<DATA_DIR> --cluster=<CLUSTER_NAME> ojbench
```

We encourage downloading the OJBench data to a Slurm cluster location, since about 15GB of data is downloaded by cloning [huggingface.co/datasets/He-Ren/OJBench_testdata](https://huggingface.co/datasets/He-Ren/OJBench_testdata). Two files named `test_python.jsonl` and `test_cpp.jsonl` will be created at `<DATA_DIR>`. Note that downloading the data requires `HF_TOKEN` to be set in the environment.
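For reference, a minimal sketch of the per-language split that data preparation performs on each record (the sample records here are hypothetical; the field names `prompt`, `language`, and `difficulty` follow the actual `prepare.py` script):

```python
import json

# Hypothetical sample lines mimicking OJBench_testdata/prompts/full.jsonl records.
raw_lines = [
    json.dumps({"prompt": "Sum two ints", "language": "python", "difficulty": "easy"}),
    json.dumps({"prompt": "Shortest path", "language": "cpp", "difficulty": "hard"}),
]

python_split, cpp_split = [], []
for line in raw_lines:
    data = json.loads(line)
    data["question"] = data.pop("prompt")            # rename for ns eval prompts
    data["subset_for_metrics"] = data["difficulty"]  # enables per-difficulty metrics
    (python_split if data["language"] == "python" else cpp_split).append(data)

print(len(python_split), len(cpp_split))  # prints: 1 1
```

Each output line thus carries a `question` and a `subset_for_metrics` field, which is why the summarized results below break down into easy/medium/hard subsets.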

#### Sample run

Here's how to run a sample evaluation of [Qwen3-32B](https://huggingface.co/Qwen/Qwen3-32B) on a Slurm cluster.

1. Prepare the data following instructions in the previous section.
2. Run
```
ns eval \
--cluster=<CLUSTER_NAME> \
--model=Qwen/Qwen3-32B \
--server_type=vllm \
--server_nodes=1 \
--server_gpus=8 \
--benchmarks=ojbench \
--split=test_python \
--data_dir=<DATA_DIR> \
--output_dir=<OUTPUT_DIR> \
++inference.temperature=0.6 \
++inference.top_p=0.95 \
++inference.tokens_to_generate=32768
```
replacing `<...>` with your desired parameters.

After all jobs are complete, you can check the results in `<OUTPUT_DIR>/eval-results/ojbench/metrics.json`. You can also take a look at `<OUTPUT_DIR>/eval-results/ojbench/summarized-results/main_*`. They should look something like this:
```
----------------------------- ojbench -----------------------------
evaluation_mode | num_entries | avg_tokens | gen_seconds | accuracy
pass@1 | 232 | 19628 | 2201 | 27.16%


--------------------------- ojbench-easy --------------------------
evaluation_mode | num_entries | avg_tokens | gen_seconds | accuracy
pass@1 | 36 | 12052 | 1729 | 72.22%


--------------------------- ojbench-hard --------------------------
evaluation_mode | num_entries | avg_tokens | gen_seconds | accuracy
pass@1 | 117 | 22585 | 2191 | 5.13%


-------------------------- ojbench-medium -------------------------
evaluation_mode | num_entries | avg_tokens | gen_seconds | accuracy
pass@1 | 79 | 18701 | 2201 | 39.24%
```

Keep in mind there is some variance between runs, so we recommend running the evaluation multiple times and averaging the accuracy. To do that automatically, you can set `--benchmarks=ojbench:N`, where `N` is your desired number of repeats.
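If you prefer to average manually over separate runs, a sketch of the averaging step (the nested key layout below, `ojbench` → `pass@1` → `accuracy`, is an assumption; adjust it to match the layout of your actual `metrics.json`):

```python
import json
from pathlib import Path


def mean_accuracy(metrics_files):
    """Average the pass@1 accuracy across repeated runs' metrics.json files.

    Assumes each file holds {"ojbench": {"pass@1": {"accuracy": <float>}}}.
    """
    accs = [
        json.loads(Path(f).read_text())["ojbench"]["pass@1"]["accuracy"]
        for f in metrics_files
    ]
    return sum(accs) / len(accs)
```

Point it at the `metrics.json` files from each repeated run to get a single averaged number.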
22 changes: 22 additions & 0 deletions nemo_skills/dataset/ojbench/__init__.py
@@ -0,0 +1,22 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# settings that define how evaluation should be done by default (all can be changed from cmdline)
DATASET_GROUP = "code"
METRICS_TYPE = "ojbench"
EVAL_SPLIT = "test_python"
EVAL_ARGS = "++eval_type=ojbench"
REQUIRES_SANDBOX = True
KEEP_MOUNTS_FOR_SANDBOX = True
GENERATION_ARGS = "++prompt_config=generic/default"
91 changes: 91 additions & 0 deletions nemo_skills/dataset/ojbench/prepare.py
@@ -0,0 +1,91 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
import shutil
import subprocess
import sys
from pathlib import Path

REPO_URL = "https://huggingface.co/datasets/He-Ren/OJBench_testdata"
HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
print("❌ Error: Hugging Face token not found.", file=sys.stderr)
print(" Please set the HF_TOKEN environment variable with your access token.", file=sys.stderr)
print(" You can create a token at: https://huggingface.co/settings/tokens", file=sys.stderr)
sys.exit(1)


def clone_dataset_repo(url, destination):
if not shutil.which("git"):
print("❌ Error: Git executable not found. Please install Git.", file=sys.stderr)
sys.exit(1)

try:
if destination.exists() or destination.is_symlink():
print(f"Destination '{destination}' already exists. Removing it...")
if destination.is_dir():
shutil.rmtree(destination)
else:
destination.unlink()

auth_url = url.replace("https://huggingface.co/", f"https://user:{HF_TOKEN}@huggingface.co/", 1)
print(f"Cloning {url} into {destination}...")
subprocess.run(["git", "clone", auth_url, destination], check=True, capture_output=True)

print("✅ Git clone is successful.")

except subprocess.CalledProcessError as e:
print("❌ Git command failed:", file=sys.stderr)
cmd = [url if i == 2 else arg for i, arg in enumerate(e.cmd)]
print(f" Command: {' '.join(map(str, cmd))}", file=sys.stderr)
stderr = e.stderr.decode().strip()
stderr = stderr.replace(HF_TOKEN, "***") if HF_TOKEN else stderr
print(f" Stderr: {stderr}", file=sys.stderr)
sys.exit(1)


if __name__ == "__main__":
data_dir = Path(__file__).absolute().parent
data_dir.mkdir(exist_ok=True)
destination = data_dir / "OJBench_testdata"
clone_dataset_repo(REPO_URL, destination)

source_file = destination / "prompts" / "full.jsonl"
python_target_file = data_dir / "test_python.jsonl"
cpp_target_file = data_dir / "test_cpp.jsonl"

print(f"Processing '{source_file}' and splitting into Python and C++ subsets...")
processed_lines = 0
try:
with (
source_file.open("r", encoding="utf-8") as infile,
python_target_file.open("w", encoding="utf-8") as outfile_py,
cpp_target_file.open("w", encoding="utf-8") as outfile_cpp,
):
for line in infile:
data = json.loads(line)
data["question"] = data.pop("prompt")
data["subset_for_metrics"] = data["difficulty"]
if data["language"] == "python":
outfile_py.write(json.dumps(data) + "\n")
elif data["language"] == "cpp":
outfile_cpp.write(json.dumps(data) + "\n")
processed_lines += 1
print(f"✅ Successfully processed {processed_lines} lines.")

except (FileNotFoundError, json.JSONDecodeError, OSError) as e:
print(f"❌ Error during file processing: {e}", file=sys.stderr)
sys.exit(1)
2 changes: 2 additions & 0 deletions nemo_skills/evaluation/evaluator/__init__.py
@@ -34,6 +34,7 @@
)
from nemo_skills.evaluation.evaluator.mcq import eval_mcq
from nemo_skills.evaluation.evaluator.mrcr import eval_mrcr
from nemo_skills.evaluation.evaluator.ojbench import eval_ojbench
from nemo_skills.evaluation.evaluator.ruler import eval_ruler
from nemo_skills.evaluation.evaluator.scicode import eval_scicode

@@ -58,6 +59,7 @@ def dummy_eval(cfg):
"mrcr": eval_mrcr,
"ioi": eval_ioi,
"bigcodebench": eval_bigcodebench,
"ojbench": eval_ojbench,
}

# Evaluator class mapping
138 changes: 138 additions & 0 deletions nemo_skills/evaluation/evaluator/ojbench.py
@@ -0,0 +1,138 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import asyncio
import json
import logging
import shlex
import textwrap
from contextlib import asynccontextmanager
from dataclasses import field
from pathlib import Path

from nemo_skills.code_execution.sandbox import get_sandbox
from nemo_skills.evaluation.evaluator.code import preprocess_code
from nemo_skills.utils import get_logger_name, nested_dataclass, unroll_files

LOG = logging.getLogger(get_logger_name(__file__))


@nested_dataclass(kw_only=True)
class OJBenchConfig:
sandbox: dict = field(default_factory=lambda: {"sandbox_type": "local"})
timeout: int = 6


@asynccontextmanager
async def sandbox_context(config: dict):
sandbox = get_sandbox(**config)
try:
yield sandbox
finally:
LOG.info("Closing sandbox...")
await sandbox.close()


async def install_packages(eval_config: OJBenchConfig) -> None:
"""Helper to install packages inside the sandbox."""

async with sandbox_context(eval_config.sandbox) as sandbox:
LOG.info("Installing required packages for ojbench evaluation...")

clone_cmd = "git clone https://github.com/He-Ren/OJBench.git"
result, _ = await sandbox.execute_code(clone_cmd, language="shell", timeout=300)
if result["process_status"] != "completed":
stderr = result.get("stderr", "Unknown error")
raise RuntimeError(f"Failed to clone OJBench repo: {stderr}")

install_cmd = "pip install -e OJBench"
result, _ = await sandbox.execute_code(install_cmd, language="shell", timeout=300)
if result["process_status"] != "completed":
stderr = result.get("stderr", "Unknown error")
raise RuntimeError(f"Failed to install ojbench. Stderr: {stderr}")

LOG.info("Successfully installed ojbench.")


async def eval_ojbench_async(cfg):
eval_config = OJBenchConfig(**cfg.eval_config)
problem_dirs = [
Path(cfg.data_dir, "ojbench/OJBench_testdata/NOI"),
Path(cfg.data_dir, "ojbench/OJBench_testdata/ICPC"),
]

await install_packages(eval_config)

async with sandbox_context(eval_config.sandbox) as sandbox:
for jsonl_file_str in unroll_files(cfg.input_files):
jsonl_file = Path(jsonl_file_str)
with open(jsonl_file, encoding="utf-8") as f_in:
samples = []
for line in f_in:
sample = json.loads(line)
sample = preprocess_code(sample, sample["language"], strip_whitespace=True)
sample["prompt"] = sample.pop("question")
sample["content"] = f"```{sample['language']}\n{sample['completion']}\n```"
sample.pop("completion")
samples.append(sample)

input_filename = jsonl_file.name.replace("output-", "eval-input-", 1)
eval_input_file = jsonl_file.with_name(input_filename)
results_filename = jsonl_file.name.replace("output-", "eval-results-", 1)
eval_results_file = jsonl_file.with_name(results_filename)

with open(eval_input_file, "w", encoding="utf-8") as f_out:
f_out.writelines(json.dumps(sample) + "\n" for sample in samples)

eval_code = textwrap.dedent(f"""
import ojbench
ojbench.init(problem_dirs={repr([str(p) for p in problem_dirs])})
ojbench.judge_jsonl(
input_path={repr(str(eval_input_file))},
output_path={repr(str(eval_results_file))},
num_workers=16
)
""")

cmd = f'env -i PATH="/usr/local/bin:/usr/bin:/bin" python3 -c {shlex.quote(eval_code)}'
output, _ = await sandbox.execute_code(
cmd,
language="shell",
timeout=eval_config.timeout * len(samples) + 60,
max_output_characters=100_000,
)

if output.get("process_status") != "completed":
raise RuntimeError(f"Evaluation failed for {jsonl_file}. Stderr: {output.get('stderr')}")

with open(eval_results_file, "rt", encoding="utf-8") as fin:
results = [json.loads(line) for line in fin]

if len(results) != len(samples):
LOG.error(f"Result count mismatch for {jsonl_file}: {len(results)} results vs {len(samples)} samples")
continue

for sample, result in zip(samples, results, strict=True):
sample["verdict"] = result["verdict"]
sample["is_passed"] = result["is_passed"]

with open(jsonl_file, "w", encoding="utf-8") as f:
for sample in samples:
f.write(json.dumps(sample) + "\n")


def eval_ojbench(cfg):
"""Synchronous wrapper to run the async evaluation."""
asyncio.run(eval_ojbench_async(cfg))
12 changes: 12 additions & 0 deletions nemo_skills/evaluation/metrics/code_metrics.py
@@ -109,3 +109,15 @@ def get_incorrect_sample(self, prediction: dict) -> dict:
def update(self, predictions):
super().update(predictions)
self._compute_pass_at_k(predictions=predictions)


class OJBenchMetrics(BaseMetrics):
def _get_score_dict(self, prediction: dict) -> dict[str, bool | int | float]:
return {"accuracy": prediction["is_passed"]}

def get_incorrect_sample(self, prediction: dict) -> dict:
return {"is_passed": False}

def update(self, predictions):
super().update(predictions)
self._compute_pass_at_k(predictions=predictions)
2 changes: 2 additions & 0 deletions nemo_skills/evaluation/metrics/map_metrics.py
@@ -21,6 +21,7 @@
BigCodeBenchMetrics,
EvalPlusMetrics,
LiveCodeBenchMetrics,
OJBenchMetrics,
SciCodeMetrics,
SweBenchMetrics,
)
@@ -53,6 +54,7 @@
"mrcr": MRCRMetrics,
"aalcr": AALCRMetrics,
"livebench_coding": LiveCodeBenchMetrics,
"ojbench": OJBenchMetrics,
}

