diff --git a/scripts/performance/utils/executors.py b/scripts/performance/utils/executors.py index a1ba8fac2b..8e012845cc 100644 --- a/scripts/performance/utils/executors.py +++ b/scripts/performance/utils/executors.py @@ -96,20 +96,22 @@ def slurm_executor( f"Logs will be written to {get_nemorun_home()}, which is probably not desired. export NEMORUN_HOME in your shell environment or use the --log_dir argument" ) + perf_env = PERF_ENV_VARS.copy() + if wandb_key is not None: - PERF_ENV_VARS["WANDB_API_KEY"] = wandb_key + perf_env["WANDB_API_KEY"] = wandb_key if gpu.lower() == "gb200": - PERF_ENV_VARS["NCCL_NET_GDR_LEVEL"] = "PHB" # For NCCL 2.25 - PERF_ENV_VARS["NCCL_NET_GDR_C2C"] = "1" # For NCCL 2.26 + perf_env["NCCL_NET_GDR_LEVEL"] = "PHB" # For NCCL 2.25 + perf_env["NCCL_NET_GDR_C2C"] = "1" # For NCCL 2.26 if nemo_home != DEFAULT_NEMO_CACHE_HOME: # DO NOT change this to 'DEFAULT_NEMO_HOME'/'NEMO_HOME' - PERF_ENV_VARS["NEMO_HOME"] = nemo_home + perf_env["NEMO_HOME"] = nemo_home mounts.extend([f"{nemo_home}:{nemo_home}"]) if hf_token is not None: - PERF_ENV_VARS.update({"HF_TOKEN": hf_token, "TRANSFORMERS_OFFLINE": "0"}) + perf_env.update({"HF_TOKEN": hf_token, "TRANSFORMERS_OFFLINE": "0"}) - PERF_ENV_VARS.update(custom_env_vars) + perf_env.update(custom_env_vars) mounts.extend(custom_mounts) # add --segment flag to sbatch if job uses GB200. @@ -143,7 +145,8 @@ def slurm_executor( gres=gres, container_image=container_image, container_mounts=mounts, - env_vars=PERF_ENV_VARS, + env_vars=perf_env, + container_env=sorted(perf_env.keys()), srun_args=srun_args, time=time_limit, mem="0", diff --git a/tests/unit_tests/scripts/performance/test_executors.py b/tests/unit_tests/scripts/performance/test_executors.py new file mode 100644 index 0000000000..a03d12bac8 --- /dev/null +++ b/tests/unit_tests/scripts/performance/test_executors.py @@ -0,0 +1,61 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for scripts/performance/utils/executors.py — container_env on SlurmExecutor."""
+
+import sys
+from pathlib import Path
+
+import pytest
+
+# scripts/performance is not an installed package; add it to sys.path so we
+# can import ``utils.executors`` the same way the scripts themselves do.
+_PERF_SCRIPTS_DIR = Path(__file__).resolve().parents[4] / "scripts" / "performance"
+if str(_PERF_SCRIPTS_DIR) not in sys.path:
+    sys.path.insert(0, str(_PERF_SCRIPTS_DIR))
+
+try:  # nemo_run is optional here: a failed import only flips HAS_NEMO_RUN, which gates the skipif markers below
+    import nemo_run  # noqa: F401
+
+    HAS_NEMO_RUN = True
+except ImportError:
+    HAS_NEMO_RUN = False
+
+if HAS_NEMO_RUN:  # import the module under test only when nemo_run itself imported successfully
+    from utils.executors import PERF_ENV_VARS, slurm_executor
+
+
+@pytest.mark.skipif(not HAS_NEMO_RUN, reason="nemo_run not installed")
+def test_container_env_includes_perf_vars(tmp_path):
+    """Every PERF_ENV_VARS key must appear in the executor's container_env so they override container defaults."""
+    executor = slurm_executor(
+        gpu="h100", account="test", partition="test",
+        log_dir=str(tmp_path), nodes=1, num_gpus_per_node=8,
+    )
+    assert executor.container_env is not None, "container_env is None — was the field removed from the executor?"
+    missing = set(PERF_ENV_VARS) - set(executor.container_env)  # base keys that were not listed in container_env
+    assert not missing, f"PERF_ENV_VARS keys missing from container_env: {missing}"
+
+
+@pytest.mark.skipif(not HAS_NEMO_RUN, reason="nemo_run not installed")
+def test_custom_env_vars_in_container_env(tmp_path):
+    """Names of vars passed via custom_env_vars must also appear in the executor's container_env."""
+    executor = slurm_executor(
+        gpu="h100", account="test", partition="test",
+        log_dir=str(tmp_path), nodes=1, num_gpus_per_node=8,
+        custom_env_vars={"MY_CUSTOM_VAR": "1"},  # caller-supplied variable whose name is checked below
+    )
+    assert "MY_CUSTOM_VAR" in executor.container_env
+
+