diff --git a/dockerfiles/Dockerfile.sandbox b/dockerfiles/Dockerfile.sandbox index f45b626f84..9a6385ab4b 100644 --- a/dockerfiles/Dockerfile.sandbox +++ b/dockerfiles/Dockerfile.sandbox @@ -15,14 +15,19 @@ # Use the base image with Python 3.10 and Flask FROM tiangolo/uwsgi-nginx-flask:python3.10 -# Install dependencies required for Lean 4 and other tools +# Install dependencies required for Lean 4, pypy3, and other tools RUN apt-get update && \ - apt-get install -y curl git net-tools bzip2 && \ + apt-get install -y curl git net-tools bzip2 build-essential libseccomp-dev && \ curl -L https://downloads.python.org/pypy/pypy3.10-v7.3.17-linux64.tar.bz2 -o /tmp/pypy.tar.bz2 && \ tar -xjf /tmp/pypy.tar.bz2 -C /opt/ && \ - ln -s /opt/pypy3.10-v7.3.17-linux64/bin/pypy3 /usr/local/bin/pypy3 && \ + ln -s /opt/pypy3.10-v7.3.17-linux64/bin/pypy3 /usr/bin/pypy3 && \ + /usr/bin/pypy3 -m ensurepip && \ rm /tmp/pypy.tar.bz2 +# Install the DMOJ judge-server using pip (OJBench eval requirement) +RUN pip install git+https://github.com/DMOJ/judge-server.git@11bf2cd03df83f0df5970a08e98b4cec2dfaecd5 + +# Install Lean 4 toolchain RUN curl https://raw.githubusercontent.com/leanprover/elan/master/elan-init.sh -sSf | sh -s -- -y && \ /root/.elan/bin/elan toolchain install leanprover/lean4:v4.12.0 && \ /root/.elan/bin/elan default leanprover/lean4:v4.12.0 && \ diff --git a/docs/evaluation/code.md b/docs/evaluation/code.md index 596b133b75..59e7054af3 100644 --- a/docs/evaluation/code.md +++ b/docs/evaluation/code.md @@ -201,4 +201,66 @@ all you need to do is replace `openhands` with `swe_agent` in the command above. ### livebench-coding - Benchmark is defined in [`nemo_skills/dataset/livebench-coding/__init__.py`](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/dataset/livebench-coding/__init__.py) -- Original benchmark source is [here](https://huggingface.co/datasets/livebench/coding). 
\ No newline at end of file +- Original benchmark source is [here](https://huggingface.co/datasets/livebench/coding). + +### OJBench + +- Benchmark is defined in [`nemo_skills/dataset/ojbench/__init__.py`](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/dataset/ojbench/__init__.py) +- Original benchmark source is [here](https://github.com/He-Ren/OJBench/tree/main). + +#### Data preparation + +Before running ns eval, you will need to prepare the data with this command: + +``` +ns prepare_data --data_dir=<data_dir> --cluster=<cluster> ojbench +``` + +We encourage you to download OJBench data into a Slurm cluster location because 15GB of data will be downloaded by cloning [huggingface.co/datasets/He-Ren/OJBench_testdata](https://huggingface.co/datasets/He-Ren/OJBench_testdata). Two files will be created at `<data_dir>` named `test_python.jsonl` and `test_cpp.jsonl`. Note that data downloading requires `HF_TOKEN` to be in the environment variables. + +#### Sample run + +Here's how to run a sample evaluation of [Qwen3-32B](https://huggingface.co/Qwen/Qwen3-32B) on a Slurm cluster. + +1. Prepare the data following instructions in the previous section. +2. Run +``` +ns eval \ + --cluster=<cluster> \ + --model=Qwen/Qwen3-32B \ + --server_type=vllm \ + --server_nodes=1 \ + --server_gpus=8 \ + --benchmarks=ojbench \ + --split=test_python \ + --data_dir=<data_dir> \ + --output_dir=<output_dir> \ + ++inference.temperature=0.6 \ + ++inference.top_p=0.95 \ + ++inference.tokens_to_generate=32768 +``` +replacing <...> with your desired parameters. + +After all jobs are complete, you can check the results in `<output_dir>/eval-results/ojbench/metrics.json`. 
You can also take a look at `/eval-results/ojbench/summarized-results/main_*` They should look something like this: +``` +----------------------------- ojbench ----------------------------- +evaluation_mode | num_entries | avg_tokens | gen_seconds | accuracy +pass@1 | 232 | 19628 | 2201 | 27.16% + + +--------------------------- ojbench-easy -------------------------- +evaluation_mode | num_entries | avg_tokens | gen_seconds | accuracy +pass@1 | 36 | 12052 | 1729 | 72.22% + + +--------------------------- ojbench-hard -------------------------- +evaluation_mode | num_entries | avg_tokens | gen_seconds | accuracy +pass@1 | 117 | 22585 | 2191 | 5.13% + + +-------------------------- ojbench-medium ------------------------- +evaluation_mode | num_entries | avg_tokens | gen_seconds | accuracy +pass@1 | 79 | 18701 | 2201 | 39.24% +``` + +Keep in mind there is some variance between runs, so we recommend running evaluation multiple times and averaging out the resolve rate. To do that automatically, you can set `--benchmarks=ojbench:N`, where N is your desired number of repeats. diff --git a/nemo_skills/dataset/ojbench/__init__.py b/nemo_skills/dataset/ojbench/__init__.py new file mode 100644 index 0000000000..73c5083480 --- /dev/null +++ b/nemo_skills/dataset/ojbench/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# settings that define how evaluation should be done by default (all can be changed from cmdline) +DATASET_GROUP = "code" +METRICS_TYPE = "ojbench" +EVAL_SPLIT = "test_python" +EVAL_ARGS = "++eval_type=ojbench" +REQUIRES_SANDBOX = True +KEEP_MOUNTS_FOR_SANDBOX = True +GENERATION_ARGS = "++prompt_config=generic/default" diff --git a/nemo_skills/dataset/ojbench/prepare.py b/nemo_skills/dataset/ojbench/prepare.py new file mode 100644 index 0000000000..3f019d6c28 --- /dev/null +++ b/nemo_skills/dataset/ojbench/prepare.py @@ -0,0 +1,91 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +import shutil +import subprocess +import sys +from pathlib import Path + +REPO_URL = "https://huggingface.co/datasets/He-Ren/OJBench_testdata" +HF_TOKEN = os.environ.get("HF_TOKEN") +if not HF_TOKEN: + print("❌ Error: Hugging Face token not found.", file=sys.stderr) + print(" Please set the HF_TOKEN environment variable with your access token.", file=sys.stderr) + print(" You can create a token at: https://huggingface.co/settings/tokens", file=sys.stderr) + sys.exit(1) + + +def clone_dataset_repo(url, destination): + if not shutil.which("git"): + print("❌ Error: Git executable not found. Please install Git.", file=sys.stderr) + sys.exit(1) + + try: + if destination.exists() or destination.is_symlink(): + print(f"Destination '{destination}' already exists. 
Removing it...") + if destination.is_dir(): + shutil.rmtree(destination) + else: + destination.unlink() + + auth_url = url.replace("https://huggingface.co/", f"https://user:{HF_TOKEN}@huggingface.co/", 1) + print(f"Cloning {url} into {destination}...") + subprocess.run(["git", "clone", auth_url, destination], check=True, capture_output=True) + + print("✅ Git clone is successful.") + + except subprocess.CalledProcessError as e: + print("❌ Git command failed:", file=sys.stderr) + cmd = [url if i == 2 else arg for i, arg in enumerate(e.cmd)] + print(f" Command: {' '.join(map(str, cmd))}", file=sys.stderr) + stderr = e.stderr.decode().strip() + stderr = stderr.replace(HF_TOKEN, "***") if HF_TOKEN else stderr + print(f" Stderr: {stderr}", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + data_dir = Path(__file__).absolute().parent + data_dir.mkdir(exist_ok=True) + destination = data_dir / "OJBench_testdata" + clone_dataset_repo(REPO_URL, destination) + + source_file = destination / "prompts" / "full.jsonl" + python_target_file = data_dir / "test_python.jsonl" + cpp_target_file = data_dir / "test_cpp.jsonl" + + print(f"Processing '{source_file}' and splitting into Python and C++ subsets...") + processed_lines = 0 + try: + with ( + source_file.open("r", encoding="utf-8") as infile, + python_target_file.open("w", encoding="utf-8") as outfile_py, + cpp_target_file.open("w", encoding="utf-8") as outfile_cpp, + ): + for line in infile: + data = json.loads(line) + data["question"] = data.pop("prompt") + data["subset_for_metrics"] = data["difficulty"] + if data["language"] == "python": + outfile_py.write(json.dumps(data) + "\n") + elif data["language"] == "cpp": + outfile_cpp.write(json.dumps(data) + "\n") + processed_lines += 1 + print(f"✅ Successfully processed {processed_lines} lines.") + + except (FileNotFoundError, json.JSONDecodeError, OSError) as e: + print(f"❌ Error during file processing: {e}", file=sys.stderr) + sys.exit(1) diff --git 
a/nemo_skills/evaluation/evaluator/__init__.py b/nemo_skills/evaluation/evaluator/__init__.py index a45bf9e36d..7bdbfda6ea 100644 --- a/nemo_skills/evaluation/evaluator/__init__.py +++ b/nemo_skills/evaluation/evaluator/__init__.py @@ -34,6 +34,7 @@ ) from nemo_skills.evaluation.evaluator.mcq import eval_mcq from nemo_skills.evaluation.evaluator.mrcr import eval_mrcr +from nemo_skills.evaluation.evaluator.ojbench import eval_ojbench from nemo_skills.evaluation.evaluator.ruler import eval_ruler from nemo_skills.evaluation.evaluator.scicode import eval_scicode @@ -58,6 +59,7 @@ def dummy_eval(cfg): "mrcr": eval_mrcr, "ioi": eval_ioi, "bigcodebench": eval_bigcodebench, + "ojbench": eval_ojbench, } # Evaluator class mapping diff --git a/nemo_skills/evaluation/evaluator/ojbench.py b/nemo_skills/evaluation/evaluator/ojbench.py new file mode 100644 index 0000000000..889409196c --- /dev/null +++ b/nemo_skills/evaluation/evaluator/ojbench.py @@ -0,0 +1,138 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import asyncio +import json +import logging +import shlex +import textwrap +from contextlib import asynccontextmanager +from dataclasses import field +from pathlib import Path + +from nemo_skills.code_execution.sandbox import get_sandbox +from nemo_skills.evaluation.evaluator.code import preprocess_code +from nemo_skills.utils import get_logger_name, nested_dataclass, unroll_files + +LOG = logging.getLogger(get_logger_name(__file__)) + + +@nested_dataclass(kw_only=True) +class OJBenchConfig: + sandbox: dict = field(default_factory=lambda: {"sandbox_type": "local"}) + timeout: int = 6 + + +@asynccontextmanager +async def sandbox_context(config: dict): + sandbox = get_sandbox(**config) + try: + yield sandbox + finally: + LOG.info("Closing sandbox...") + await sandbox.close() + + +async def install_packages(eval_config: OJBenchConfig) -> bool: + """Helper to install packages inside the sandbox.""" + + async with sandbox_context(eval_config.sandbox) as sandbox: + LOG.info("Installing required packages for ojbench evaluation...") + + clone_cmd = "git clone https://github.com/He-Ren/OJBench.git" + result, _ = await sandbox.execute_code(clone_cmd, language="shell", timeout=300) + if result["process_status"] != "completed": + stderr = result.get("stderr", "Unknown error") + raise RuntimeError(f"Failed to clone OJBench repo: {stderr}") + + install_cmd = "pip install -e OJBench" + result, _ = await sandbox.execute_code(install_cmd, language="shell", timeout=300) + if result["process_status"] != "completed": + stderr = result.get("stderr", "Unknown error") + raise RuntimeError(f"Failed to install ojbench. 
Stderr: {stderr}") + + LOG.info("Successfully installed ojbench.") + + +async def eval_ojbench_async(cfg): + eval_config = OJBenchConfig(**cfg.eval_config) + problem_dirs = [ + Path(cfg.data_dir, "ojbench/OJBench_testdata/NOI"), + Path(cfg.data_dir, "ojbench/OJBench_testdata/ICPC"), + ] + + await install_packages(eval_config) + + async with sandbox_context(eval_config.sandbox) as sandbox: + for jsonl_file_str in unroll_files(cfg.input_files): + jsonl_file = Path(jsonl_file_str) + with open(jsonl_file, encoding="utf-8") as f_in: + samples = [] + for line in f_in: + sample = json.loads(line) + sample = preprocess_code(sample, sample["language"], strip_whitespace=True) + sample["prompt"] = sample.pop("question") + sample["content"] = f"```{sample['language']}\n{sample['completion']}\n```" + sample.pop("completion") + samples.append(sample) + + input_filename = jsonl_file.name.replace("output-", "eval-input-", 1) + eval_input_file = jsonl_file.with_name(input_filename) + results_filename = jsonl_file.name.replace("output-", "eval-results-", 1) + eval_results_file = jsonl_file.with_name(results_filename) + + with open(eval_input_file, "w", encoding="utf-8") as f_out: + f_out.writelines(json.dumps(sample) + "\n" for sample in samples) + + eval_code = textwrap.dedent(f""" + import ojbench + ojbench.init(problem_dirs={repr([str(p) for p in problem_dirs])}) + ojbench.judge_jsonl( + input_path={repr(str(eval_input_file))}, + output_path={repr(str(eval_results_file))}, + num_workers=16 + ) + """) + + cmd = f'env -i PATH="/usr/local/bin:/usr/bin:/bin" python3 -c {shlex.quote(eval_code)}' + output, _ = await sandbox.execute_code( + cmd, + language="shell", + timeout=eval_config.timeout * len(samples) + 60, + max_output_characters=100_000, + ) + + if output.get("process_status") != "completed": + raise RuntimeError(f"Evaluation failed for {jsonl_file}. 
Stderr: {output.get('stderr')}") + + with open(eval_results_file, "rt", encoding="utf-8") as fin: + results = [json.loads(line) for line in fin] + + if len(results) != len(samples): + LOG.error(f"Result count mismatch for {jsonl_file}: {len(results)} results vs {len(samples)} samples") + continue + + for sample, result in zip(samples, results, strict=True): + sample["verdict"] = result["verdict"] + sample["is_passed"] = result["is_passed"] + + with open(jsonl_file, "w", encoding="utf-8") as f: + for sample in samples: + f.write(json.dumps(sample) + "\n") + + +def eval_ojbench(cfg): + """Synchronous wrapper to run the async evaluation.""" + asyncio.run(eval_ojbench_async(cfg)) diff --git a/nemo_skills/evaluation/metrics/code_metrics.py b/nemo_skills/evaluation/metrics/code_metrics.py index ec116d08d5..ceb6c11388 100644 --- a/nemo_skills/evaluation/metrics/code_metrics.py +++ b/nemo_skills/evaluation/metrics/code_metrics.py @@ -109,3 +109,15 @@ def get_incorrect_sample(self, prediction: dict) -> dict: def update(self, predictions): super().update(predictions) self._compute_pass_at_k(predictions=predictions) + + +class OJBenchMetrics(BaseMetrics): + def _get_score_dict(self, prediction: dict) -> dict[str, bool | int | float]: + return {"accuracy": prediction["is_passed"]} + + def get_incorrect_sample(self, prediction: dict) -> dict: + return {"is_passed": False} + + def update(self, predictions): + super().update(predictions) + self._compute_pass_at_k(predictions=predictions) diff --git a/nemo_skills/evaluation/metrics/map_metrics.py b/nemo_skills/evaluation/metrics/map_metrics.py index 2bee361c37..304899130d 100644 --- a/nemo_skills/evaluation/metrics/map_metrics.py +++ b/nemo_skills/evaluation/metrics/map_metrics.py @@ -21,6 +21,7 @@ BigCodeBenchMetrics, EvalPlusMetrics, LiveCodeBenchMetrics, + OJBenchMetrics, SciCodeMetrics, SweBenchMetrics, ) @@ -53,6 +54,7 @@ "mrcr": MRCRMetrics, "aalcr": AALCRMetrics, "livebench_coding": LiveCodeBenchMetrics, + "ojbench": 
OJBenchMetrics, } diff --git a/nemo_skills/pipeline/eval.py b/nemo_skills/pipeline/eval.py index 2096493fae..e1634a886f 100644 --- a/nemo_skills/pipeline/eval.py +++ b/nemo_skills/pipeline/eval.py @@ -171,6 +171,10 @@ def eval( False, help="If True, will re-run jobs even if a corresponding '.done' file already exists" ), with_sandbox: bool = typer.Option(False, help="If True, will start a sandbox container alongside this job"), + keep_mounts_for_sandbox: bool = typer.Option( + False, + help="If True, will keep the mounts for the sandbox container. Note that, it is risky given that sandbox executes LLM commands and could potentially lead to data loss. So, we advise not to use this unless absolutely necessary.", + ), check_mounted_paths: bool = typer.Option(False, help="Check if mounted paths are available on the remote machine"), log_samples: bool = typer.Option( False, @@ -308,6 +312,7 @@ def eval( extra_datasets_type, exclusive, with_sandbox, + keep_mounts_for_sandbox, wandb_parameters, extra_eval_args, eval_requires_judge=eval_requires_judge, @@ -325,9 +330,15 @@ def eval( with pipeline_utils.get_exp(expname, cluster_config, _reuse_exp) as exp: # scheduling main eval jobs for idx, job_args in enumerate(job_batches): - cmds, job_benchmarks, job_needs_sandbox, job_server_config, job_server_address, job_server_command = ( - job_args - ) + ( + cmds, + job_benchmarks, + job_needs_sandbox, + job_needs_sandbox_to_keep_mounts, + job_server_config, + job_server_address, + job_server_command, + ) = job_args prev_tasks = _task_dependencies for _ in range(dependent_jobs + 1): @@ -343,6 +354,7 @@ def eval( time_min=time_min, server_config=job_server_config, with_sandbox=job_needs_sandbox or with_sandbox, + keep_mounts_for_sandbox=job_needs_sandbox_to_keep_mounts or keep_mounts_for_sandbox, sandbox_port=None if get_random_port else 6000, run_after=run_after, reuse_code_exp=reuse_code_exp, @@ -409,6 +421,7 @@ def eval( partition=partition, time_min=time_min, 
with_sandbox=with_sandbox, + keep_mounts_for_sandbox=keep_mounts_for_sandbox, run_after=run_after, reuse_code_exp=reuse_code_exp, reuse_code=reuse_code, diff --git a/nemo_skills/pipeline/generate.py b/nemo_skills/pipeline/generate.py index ac5a8dfd25..aa469bfb09 100644 --- a/nemo_skills/pipeline/generate.py +++ b/nemo_skills/pipeline/generate.py @@ -130,6 +130,10 @@ def generate( False, help="If True, will re-run jobs even if a corresponding '.done' file already exists" ), with_sandbox: bool = typer.Option(False, help="If True, will start a sandbox container alongside this job"), + keep_mounts_for_sandbox: bool = typer.Option( + False, + help="If True, will keep the mounts for the sandbox container. Note that, it is risky given that sandbox executes LLM commands and could potentially lead to data loss. So, we advise not to use this unless absolutely necessary.", + ), check_mounted_paths: bool = typer.Option(False, help="Check if mounted paths are available on the remote machine"), log_samples: bool = typer.Option( False, @@ -309,6 +313,7 @@ def generate( time_min=time_min, server_config=server_config, with_sandbox=with_sandbox, + keep_mounts_for_sandbox=keep_mounts_for_sandbox, sandbox_port=None if get_random_port else 6000, run_after=run_after, reuse_code=reuse_code, diff --git a/nemo_skills/pipeline/prepare_data.py b/nemo_skills/pipeline/prepare_data.py index f271c2be8f..0d5e40bfc0 100644 --- a/nemo_skills/pipeline/prepare_data.py +++ b/nemo_skills/pipeline/prepare_data.py @@ -27,7 +27,7 @@ # TODO: read this from init.py -DATASETS_REQUIRE_DATA_DIR = ["ruler", "ioi24"] +DATASETS_REQUIRE_DATA_DIR = ["ruler", "ioi24", "ojbench"] @app.command(context_settings={"allow_extra_args": True, "ignore_unknown_options": True}) @@ -55,6 +55,10 @@ def prepare_data( reuse_code_exp: str = typer.Option(None, help="Experiment to reuse code from"), config_dir: str = typer.Option(None, help="Custom cluster config directory"), with_sandbox: bool = typer.Option(False, help="Start a 
sandbox container alongside"), + keep_mounts_for_sandbox: bool = typer.Option( + False, + help="If True, will keep the mounts for the sandbox container. Note that, it is risky given that sandbox executes LLM commands and could potentially lead to data loss. So, we advise not to use this unless absolutely necessary.", + ), log_dir: str = typer.Option(None, help="Custom location for slurm logs"), exclusive: bool = typer.Option(False, help="If set will add exclusive flag to the slurm job."), check_mounted_paths: bool = typer.Option(False, help="Check mounted paths availability"), @@ -139,6 +143,7 @@ def prepare_data( reuse_code_exp=reuse_code_exp, config_dir=config_dir, with_sandbox=with_sandbox, + keep_mounts_for_sandbox=keep_mounts_for_sandbox, log_dir=log_dir, exclusive=exclusive, check_mounted_paths=check_mounted_paths, diff --git a/nemo_skills/pipeline/run_cmd.py b/nemo_skills/pipeline/run_cmd.py index f4f217bb92..57b01186ed 100644 --- a/nemo_skills/pipeline/run_cmd.py +++ b/nemo_skills/pipeline/run_cmd.py @@ -88,6 +88,10 @@ def run_cmd( ), config_dir: str = typer.Option(None, help="Can customize where we search for cluster configs"), with_sandbox: bool = typer.Option(False, help="If True, will start a sandbox container alongside this job"), + keep_mounts_for_sandbox: bool = typer.Option( + False, + help="If True, will keep the mounts for the sandbox container. Note that, it is risky given that sandbox executes LLM commands and could potentially lead to data loss. So, we advise not to use this unless absolutely necessary.", + ), log_dir: str = typer.Option( None, help="Can specify a custom location for slurm logs. 
" @@ -184,6 +188,7 @@ def run_cmd( time_min=time_min, server_config=server_config, with_sandbox=with_sandbox, + keep_mounts_for_sandbox=keep_mounts_for_sandbox, sandbox_port=None if get_random_port else 6000, run_after=run_after, reuse_code=reuse_code, diff --git a/nemo_skills/pipeline/start_server.py b/nemo_skills/pipeline/start_server.py index af9ee1c4ac..82470e5134 100644 --- a/nemo_skills/pipeline/start_server.py +++ b/nemo_skills/pipeline/start_server.py @@ -62,6 +62,10 @@ def start_server( with_sandbox: bool = typer.Option( False, help="Starts a sandbox (set this flag if model supports calling Python interpreter)" ), + keep_mounts_for_sandbox: bool = typer.Option( + False, + help="If True, will keep the mounts for the sandbox container. Note that, it is risky given that sandbox executes LLM commands and could potentially lead to data loss. So, we advise not to use this unless absolutely necessary.", + ), launch_chat_interface: bool = typer.Option( False, help="If True, will launch a gradio app that provides chat with the model" ), @@ -119,6 +123,7 @@ def start_server( time_min=time_min, server_config=server_config, with_sandbox=with_sandbox, + keep_mounts_for_sandbox=keep_mounts_for_sandbox, sandbox_port=None if get_random_port else 6000, slurm_kwargs={"exclusive": exclusive} if exclusive else None, ) diff --git a/nemo_skills/pipeline/train.py b/nemo_skills/pipeline/train.py index 15446706f9..29a817b0c1 100755 --- a/nemo_skills/pipeline/train.py +++ b/nemo_skills/pipeline/train.py @@ -247,6 +247,10 @@ def train( wandb_project: str = typer.Option("nemo-skills", help="Weights & Biases project name"), disable_wandb: bool = typer.Option(False, help="Disable wandb logging"), with_sandbox: bool = typer.Option(False, help="If sandbox is required for code generation"), + keep_mounts_for_sandbox: bool = typer.Option( + False, + help="If True, will keep the mounts for the sandbox container. 
Note that, it is risky given that sandbox executes LLM commands and could potentially lead to data loss. So, we advise not to use this unless absolutely necessary.", + ), partition: str = typer.Option(None, help="Specify partition for jobs"), time_min: str = typer.Option(None, help="If specified, will use as a time-min slurm parameter"), average_steps: str = typer.Option( @@ -377,6 +381,7 @@ def train( partition=partition, time_min=time_min, with_sandbox=with_sandbox, + keep_mounts_for_sandbox=keep_mounts_for_sandbox, run_after=run_after, reuse_code=reuse_code, reuse_code_exp=reuse_code_exp, diff --git a/nemo_skills/pipeline/utils/eval.py b/nemo_skills/pipeline/utils/eval.py index 146049d951..071be72505 100644 --- a/nemo_skills/pipeline/utils/eval.py +++ b/nemo_skills/pipeline/utils/eval.py @@ -93,6 +93,7 @@ class BenchmarkArgs: judge_args: str judge_pipeline_args: dict requires_sandbox: bool + keep_mounts_for_sandbox: bool generation_module: str num_samples: int num_chunks: int | None @@ -180,6 +181,9 @@ def get_benchmark_args_from_module( if prompt_config: generation_args = f"++prompt_config={prompt_config} {generation_args}" requires_sandbox = get_arg_from_module_or_dict(benchmark_module, "REQUIRES_SANDBOX", False, override_dict) + keep_mounts_for_sandbox = get_arg_from_module_or_dict( + benchmark_module, "KEEP_MOUNTS_FOR_SANDBOX", False, override_dict + ) generation_module = get_arg_from_module_or_dict( benchmark_module, "GENERATION_MODULE", "nemo_skills.inference.generate", override_dict @@ -221,6 +225,7 @@ def get_benchmark_args_from_module( judge_args=judge_args, judge_pipeline_args=judge_pipeline_args, requires_sandbox=requires_sandbox, + keep_mounts_for_sandbox=keep_mounts_for_sandbox, generation_module=generation_module, num_samples=num_samples, num_chunks=num_chunks, @@ -304,6 +309,7 @@ def prepare_eval_commands( extra_datasets_type, exclusive, with_sandbox, + keep_mounts_for_sandbox, wandb_parameters, extra_eval_args, eval_requires_judge, @@ -365,6 
+371,9 @@ def prepare_eval_commands( if benchmark_args.requires_sandbox and not with_sandbox: LOG.warning("Found benchmark (%s) which requires sandbox, enabled sandbox for it.", benchmark) + if benchmark_args.requires_sandbox and not keep_mounts_for_sandbox: + LOG.warning("Found benchmark (%s) which requires sandbox to keep mounts, enabling it.", benchmark) + total_evals = 0 for benchmark, benchmark_args in benchmarks_dict.items(): if benchmark_args.num_samples == 0: @@ -505,12 +514,16 @@ def prepare_eval_commands( if cur_eval == total_evals - 1 or cur_job_idx != eval_to_job_map[cur_eval + 1]: job_needs_sandbox = any(benchmarks_dict[b].requires_sandbox for b in job_benchmarks) + job_needs_sandbox_to_keep_mounts = any( + benchmarks_dict[b].keep_mounts_for_sandbox for b in job_benchmarks + ) # TODO: move to a dataclass job_batches.append( ( job_cmds, job_benchmarks, job_needs_sandbox, + job_needs_sandbox_to_keep_mounts, job_server_config, job_server_address, # a check above guarantees that this is the same for all tasks in a job diff --git a/nemo_skills/pipeline/utils/exp.py b/nemo_skills/pipeline/utils/exp.py index f08c2beba1..a27ffcba8c 100644 --- a/nemo_skills/pipeline/utils/exp.py +++ b/nemo_skills/pipeline/utils/exp.py @@ -343,6 +343,7 @@ def add_task( partition=None, time_min=None, with_sandbox=False, + keep_mounts_for_sandbox=False, sandbox_port: int | None = None, server_config=None, reuse_code_exp: str | run.Experiment | None = None, @@ -527,7 +528,7 @@ def add_task( gpus_per_node=0, partition=partition, time_min=time_min, - mounts=[], # we don't want to mount anything + mounts=None if keep_mounts_for_sandbox else [], dependencies=dependencies, job_name=task_name, log_dir=log_dir,