From c6ebec2630c3eee699b4bc83f1b841a1c9152032 Mon Sep 17 00:00:00 2001 From: Alex Filby Date: Mon, 30 Mar 2026 17:28:03 -0500 Subject: [PATCH 1/4] Set nemo vars in slurm container-env field. Additionally adds a flag for users to flag their own vars that need override. Signed-off-by: Alex Filby --- scripts/performance/argument_parser.py | 8 ++++++++ scripts/performance/setup_experiment.py | 3 +++ scripts/performance/utils/executors.py | 2 ++ 3 files changed, 13 insertions(+) diff --git a/scripts/performance/argument_parser.py b/scripts/performance/argument_parser.py index 06193ff840..439c3065c2 100644 --- a/scripts/performance/argument_parser.py +++ b/scripts/performance/argument_parser.py @@ -437,6 +437,14 @@ def parse_cli_args(): help="Comma separated string of environment variables", default={}, ) + slurm_args.add_argument( + "--container_env", + type=list_of_strings, + metavar="KEY[,KEY2,...]", + help="Comma-separated list of environment variable names that should override same-named " + "values from the container image. 
Use -E/--env or -ce/--custom_env_vars to set the value explicitly.", + default=[], + ) slurm_args.add_argument( "-E", "--env", diff --git a/scripts/performance/setup_experiment.py b/scripts/performance/setup_experiment.py index 2598c58b39..df153f439c 100755 --- a/scripts/performance/setup_experiment.py +++ b/scripts/performance/setup_experiment.py @@ -242,6 +242,7 @@ def main( dgxc_pvc_mount_path: str, config_variant: str = "v1", gres: Optional[str] = None, + container_env: Optional[List[str]] = None, ): """Sets up the experiment and runs it.""" if ( @@ -329,6 +330,7 @@ def main( nemo_home=nemo_home, additional_slurm_params=additional_slurm_params, wandb_key=wandb_key, + container_env=container_env or [], ) else: executor = dgxc_executor( @@ -668,4 +670,5 @@ def main( dgxc_pvc_mount_path=args.dgxc_pvc_mount_path, config_variant=config_variant, gres=args.gres, + container_env=args.container_env, ) diff --git a/scripts/performance/utils/executors.py b/scripts/performance/utils/executors.py index a1ba8fac2b..81ecc33a0c 100644 --- a/scripts/performance/utils/executors.py +++ b/scripts/performance/utils/executors.py @@ -60,6 +60,7 @@ def slurm_executor( custom_mounts: List[str] = [], custom_env_vars: Dict[str, str] = {}, custom_srun_args: List[str] = [], + container_env: List[str] = [], hf_token: str = None, nemo_home: str = DEFAULT_NEMO_HOME, wandb_key: str = None, @@ -144,6 +145,7 @@ def slurm_executor( container_image=container_image, container_mounts=mounts, env_vars=PERF_ENV_VARS, + container_env=sorted(set(PERF_ENV_VARS.keys()) | set(container_env)), srun_args=srun_args, time=time_limit, mem="0", From 0ea3fc1a3b64d02a64c751403a68f95e6910603f Mon Sep 17 00:00:00 2001 From: Alex Filby Date: Tue, 31 Mar 2026 19:20:17 -0500 Subject: [PATCH 2/4] Add unit test for container-env. 
Signed-off-by: Alex Filby --- .../scripts/performance/test_executors.py | 70 +++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 tests/unit_tests/scripts/performance/test_executors.py diff --git a/tests/unit_tests/scripts/performance/test_executors.py b/tests/unit_tests/scripts/performance/test_executors.py new file mode 100644 index 0000000000..3ced34fb66 --- /dev/null +++ b/tests/unit_tests/scripts/performance/test_executors.py @@ -0,0 +1,70 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for scripts/performance/utils/executors.py — container_env on SlurmExecutor.""" + +import sys +from pathlib import Path + +import pytest + +# scripts/performance is not an installed package; add it to sys.path so we +# can import ``utils.executors`` the same way the scripts themselves do. 
+_PERF_SCRIPTS_DIR = Path(__file__).resolve().parents[4] / "scripts" / "performance" +if str(_PERF_SCRIPTS_DIR) not in sys.path: + sys.path.insert(0, str(_PERF_SCRIPTS_DIR)) + +try: + import nemo_run # noqa: F401 + + HAS_NEMO_RUN = True +except ImportError: + HAS_NEMO_RUN = False + +if HAS_NEMO_RUN: + from utils.executors import PERF_ENV_VARS, slurm_executor + + +@pytest.mark.skipif(not HAS_NEMO_RUN, reason="nemo_run not installed") +def test_container_env_includes_perf_vars(tmp_path): + """PERF_ENV_VARS keys must appear in container_env so they override container defaults.""" + executor = slurm_executor( + gpu="h100", account="test", partition="test", + log_dir=str(tmp_path), nodes=1, num_gpus_per_node=8, + ) + assert executor.container_env is not None, "container_env is None — was the field removed from the executor?" + missing = set(PERF_ENV_VARS) - set(executor.container_env) + assert not missing, f"PERF_ENV_VARS keys missing from container_env: {missing}" + + +@pytest.mark.skipif(not HAS_NEMO_RUN, reason="nemo_run not installed") +def test_custom_env_vars_in_container_env(tmp_path): + """Vars passed via custom_env_vars must also appear in container_env.""" + executor = slurm_executor( + gpu="h100", account="test", partition="test", + log_dir=str(tmp_path), nodes=1, num_gpus_per_node=8, + custom_env_vars={"MY_CUSTOM_VAR": "1"}, + ) + assert "MY_CUSTOM_VAR" in executor.container_env + + +@pytest.mark.skipif(not HAS_NEMO_RUN, reason="nemo_run not installed") +def test_container_env_param_forwarded(tmp_path): + """Keys passed via the container_env parameter must appear in container_env.""" + executor = slurm_executor( + gpu="h100", account="test", partition="test", + log_dir=str(tmp_path), nodes=1, num_gpus_per_node=8, + container_env=["UPSTREAM_SET_VAR"], + ) + assert "UPSTREAM_SET_VAR" in executor.container_env From 653b67b1e4be773b39c5e3a46348a64b4b3c5f07 Mon Sep 17 00:00:00 2001 From: Alex Filby Date: Tue, 31 Mar 2026 21:38:58 -0500 Subject: [PATCH 3/4] Port 
perf_env change from #2847. Otherwise, depending on merge order, the container_env field still ends up pointing to PERF_ENV_VARS. Signed-off-by: Alex Filby --- scripts/performance/utils/executors.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/scripts/performance/utils/executors.py b/scripts/performance/utils/executors.py index 81ecc33a0c..e12b24477f 100644 --- a/scripts/performance/utils/executors.py +++ b/scripts/performance/utils/executors.py @@ -97,20 +97,22 @@ def slurm_executor( f"Logs will be written to {get_nemorun_home()}, which is probably not desired. export NEMORUN_HOME in your shell environment or use the --log_dir argument" ) + perf_env = PERF_ENV_VARS.copy() + if wandb_key is not None: - PERF_ENV_VARS["WANDB_API_KEY"] = wandb_key + perf_env["WANDB_API_KEY"] = wandb_key if gpu.lower() == "gb200": - PERF_ENV_VARS["NCCL_NET_GDR_LEVEL"] = "PHB" # For NCCL 2.25 - PERF_ENV_VARS["NCCL_NET_GDR_C2C"] = "1" # For NCCL 2.26 + perf_env["NCCL_NET_GDR_LEVEL"] = "PHB" # For NCCL 2.25 + perf_env["NCCL_NET_GDR_C2C"] = "1" # For NCCL 2.26 if nemo_home != DEFAULT_NEMO_CACHE_HOME: # DO NOT change this to 'DEFAULT_NEMO_HOME'/'NEMO_HOME' - PERF_ENV_VARS["NEMO_HOME"] = nemo_home + perf_env["NEMO_HOME"] = nemo_home mounts.extend([f"{nemo_home}:{nemo_home}"]) if hf_token is not None: - PERF_ENV_VARS.update({"HF_TOKEN": hf_token, "TRANSFORMERS_OFFLINE": "0"}) + perf_env.update({"HF_TOKEN": hf_token, "TRANSFORMERS_OFFLINE": "0"}) - PERF_ENV_VARS.update(custom_env_vars) + perf_env.update(custom_env_vars) mounts.extend(custom_mounts) # add --segment flag to sbatch if job uses GB200. 
@@ -144,8 +146,8 @@ def slurm_executor( gres=gres, container_image=container_image, container_mounts=mounts, - env_vars=PERF_ENV_VARS, - container_env=sorted(set(PERF_ENV_VARS.keys()) | set(container_env)), + env_vars=perf_env, + container_env=sorted(set(perf_env.keys()) | set(container_env)), srun_args=srun_args, time=time_limit, mem="0", From 35a11b4d8fd47ef287051800753dc706202d62ac Mon Sep 17 00:00:00 2001 From: Alex Filby Date: Thu, 2 Apr 2026 16:58:14 -0500 Subject: [PATCH 4/4] Remove --container-env flag. Signed-off-by: Alex Filby --- scripts/performance/argument_parser.py | 8 -------- scripts/performance/setup_experiment.py | 3 --- scripts/performance/utils/executors.py | 3 +-- tests/unit_tests/scripts/performance/test_executors.py | 9 --------- 4 files changed, 1 insertion(+), 22 deletions(-) diff --git a/scripts/performance/argument_parser.py b/scripts/performance/argument_parser.py index 439c3065c2..06193ff840 100644 --- a/scripts/performance/argument_parser.py +++ b/scripts/performance/argument_parser.py @@ -437,14 +437,6 @@ def parse_cli_args(): help="Comma separated string of environment variables", default={}, ) - slurm_args.add_argument( - "--container_env", - type=list_of_strings, - metavar="KEY[,KEY2,...]", - help="Comma-separated list of environment variable names that should override same-named " - "values from the container image. 
Use -E/--env or -ce/--custom_env_vars to set the value explicitly.", - default=[], - ) slurm_args.add_argument( "-E", "--env", diff --git a/scripts/performance/setup_experiment.py b/scripts/performance/setup_experiment.py index df153f439c..2598c58b39 100755 --- a/scripts/performance/setup_experiment.py +++ b/scripts/performance/setup_experiment.py @@ -242,7 +242,6 @@ def main( dgxc_pvc_mount_path: str, config_variant: str = "v1", gres: Optional[str] = None, - container_env: Optional[List[str]] = None, ): """Sets up the experiment and runs it.""" if ( @@ -330,7 +329,6 @@ def main( nemo_home=nemo_home, additional_slurm_params=additional_slurm_params, wandb_key=wandb_key, - container_env=container_env or [], ) else: executor = dgxc_executor( @@ -670,5 +668,4 @@ def main( dgxc_pvc_mount_path=args.dgxc_pvc_mount_path, config_variant=config_variant, gres=args.gres, - container_env=args.container_env, ) diff --git a/scripts/performance/utils/executors.py b/scripts/performance/utils/executors.py index e12b24477f..8e012845cc 100644 --- a/scripts/performance/utils/executors.py +++ b/scripts/performance/utils/executors.py @@ -60,7 +60,6 @@ def slurm_executor( custom_mounts: List[str] = [], custom_env_vars: Dict[str, str] = {}, custom_srun_args: List[str] = [], - container_env: List[str] = [], hf_token: str = None, nemo_home: str = DEFAULT_NEMO_HOME, wandb_key: str = None, @@ -147,7 +146,7 @@ def slurm_executor( container_image=container_image, container_mounts=mounts, env_vars=perf_env, - container_env=sorted(set(perf_env.keys()) | set(container_env)), + container_env=sorted(perf_env.keys()), srun_args=srun_args, time=time_limit, mem="0", diff --git a/tests/unit_tests/scripts/performance/test_executors.py b/tests/unit_tests/scripts/performance/test_executors.py index 3ced34fb66..a03d12bac8 100644 --- a/tests/unit_tests/scripts/performance/test_executors.py +++ b/tests/unit_tests/scripts/performance/test_executors.py @@ -59,12 +59,3 @@ def 
test_custom_env_vars_in_container_env(tmp_path): assert "MY_CUSTOM_VAR" in executor.container_env -@pytest.mark.skipif(not HAS_NEMO_RUN, reason="nemo_run not installed") -def test_container_env_param_forwarded(tmp_path): - """Keys passed via the container_env parameter must appear in container_env.""" - executor = slurm_executor( - gpu="h100", account="test", partition="test", - log_dir=str(tmp_path), nodes=1, num_gpus_per_node=8, - container_env=["UPSTREAM_SET_VAR"], - ) - assert "UPSTREAM_SET_VAR" in executor.container_env